* Data Preperation: Schnitzer and Watzinger (2020) Measuring Spillovers of Venture Capital, Review of Economics and Statistics
* Please read the readme file for required programs and data.

set matsize 11000
set more off

*********************************************************************************************************
* I) Venture Capital Raw Data
*********************************************************************************************************
{
* Read-in data from the Venture Expert Database & create industry identifier
{
import excel "data\VC_USA (1) Name.xls", sheet("Quick Search") cellrange(A8)  clear
ren A CompanyName 
ren B InvestmentDate 
ren C CompanyID 
ren D CompanyFoundedDate 
ren E SICCode 
ren F CompanyCity 
ren G NeworFollowonInvestment 
ren H RoundNumber 
ren I EquityAmountEstimatedUSDMil 
ren J FirstInvestmentReceivedDate 
ren K LastInvestmentReceivedDate 
ren L CompanyInvestmentStage1atRou 
ren M CompanyInvestmentStage2atRou 
ren N CompanyInvestmentStage3atRou 
ren O CompanyStatus 
ren P FirmName 
ren Q FirmCapitalUnderManagementUS
drop if RoundNumber==.
save "proc\merge1.dta", replace
forvalues i = 2/19 {
di `i'
import excel "data\VC_USA (`i') Date.xls", sheet("Quick Search") cellrange(A8)  clear
ren A CompanyName 
ren B InvestmentDate 
ren C CompanyID 
ren D CompanyFoundedDate 
ren E SICCode 
ren F CompanyCity 
ren G NeworFollowonInvestment 
ren H RoundNumber 
ren I EquityAmountEstimatedUSDMil 
ren J FirstInvestmentReceivedDate 
ren K LastInvestmentReceivedDate 
ren L CompanyInvestmentStage1atRou 
ren M CompanyInvestmentStage2atRou 
ren N CompanyInvestmentStage3atRou 
ren O CompanyStatus 
ren P FirmName 
ren Q FirmCapitalUnderManagementUS
drop if RoundNumber==.
local j = `i'-1
append using "proc\merge`j'.dta"
save "proc\merge`i'.dta", replace
}

save "proc\rawVentureDate.dta", replace
}

* Prepare Venture Expert data
{
use "proc\rawVentureDate.dta", clear
egen ID = group(CompanyName)
destring EquityAmountEstimatedUSDMil, force replace
keep if CompanyInvestmentStage1atRou=="Venture Capital"
drop if CompanyStatus=="LBO"
ren FirmName fund
ren CompanyName firmName
ren FirmCapitalUnderManagementUS SizeFund
ren EquityAmountEstimatedUSDMil Amount
ren CompanyInvestmentStage2atRo currentRound
ren SICCode SIC
destring SIC, force replace
gen year = year(date(InvestmentDate , "DMY"))
gen firstInvestment = year(FirstInvestmentReceivedDate)
gen lastInvestment = year(LastInvestmentReceivedDate)
ren RoundNumber fundingRoundNumber
drop LastInvestmentReceivedDate  FirstInvestmentReceivedDate InvestmentDate 
drop if Amount==. 


merge m:1 year using "data\deflator"
drop if _merge==2
drop _merge
replace Amount = Amount/pIndex

* Drop very large investment  - some company producing satellites for 100 mio's
gen maxA = 0 
replace maxA = Amount if fundingRoundNumber==1 
bysort ID: egen mA=max(maxA)
sum mA, detail
drop if mA>r(p99) | mA==0 
drop if  mA==0
drop mA maxA

gen success=0 
replace success =1 if CompanyStatus=="Went Public" | CompanyStatus=="In Registration" 

gen venture = 1
drop if SIC==.
replace SIC = floor(SIC)
egen fundID = group(fund)
save "proc\rawDataVenture_work.dta", replace

use "proc\rawDataVenture_work", clear
keep ID firstInvestment lastInvestment 
sort ID firstInvestment lastInvestment
duplicates drop ID, force
save "proc\firstLastInvestment", replace

use "proc\rawDataVenture_work", clear
// Name matching the VC companies to assignee database of NBER with the Java Program
keep ID firmName
order ID firmName
sort ID firmName
duplicates drop ID firmName, force
keep ID
duplicates drop ID, force
save "proc\IDVen.dta", replace
}

}
/*
*********************************************************************************************************************************************************************
* II) Patent Database - all kind of transformations - very time consuming
*********************************************************************************************************************************************************************
{

*********************************************************************************************************************************************************************
* Match patents to start-ups
*********************************************************************************************************************************************************************
{

insheet using "data\invpat.csv", clear
keep if country=="US"
keep patent city state assignee
ren assignee assignee_res
compress
duplicates drop
destring patent, force replace
compress
save temp, replace
use temp, clear
ren state stateInv
joinby patent using "data\pat76_06_assg.dta", unmatched(both)
keep if country=="US"
keep if pdpass!=.
replace state = stateInv if _merge==3
drop _merge
drop stateInv
save "proc\pats.dta", replace


* get state for each assignee from patent data
use "proc\pats", clear
keep pdpass state city assignee_res

duplicates drop
sort pdpass 
gen number = 1
ren state State
joinby pdpass using "data\assignee.dta"
compress
save "proc\pdpassState", replace



use "proc\rawDataVenture_work", clear
keep ID CompanyCity
ren CompanyCity city
keep ID city
joinby city using proc\cityStateCode, unmatched(master)
drop _merge
duplicates drop
save proc\IDCity, replace
}


**********************************************************
* Assignee data
**********************************************************
{
insheet using "data\assignee.csv", clear
* Asgype
* 0 individuals
* 1 unknown
* 2 corporations US
* 3 foreign corporations
* 4 indiviuals or unknown  (US)
* 5 individuals (foreign)
* 6 US government
* 7 foreign governments
* 8 US government
* 9 some kind of universities US
gen patentString = patent
destring patent, force replace
ren assignee standard_name
drop if standard_name==""
joinby patent using proc\pats.dta, unmatched(both)  update replace // Use old NBER file to get pdpasses
drop if pdpass==. // here I drop all unknown patents. do not know if this is a good idea
drop _merge
keep patent pdpass standard_name patentString
drop patent
ren patentString patent
compress
save "proc\pdpassPatent", replace


insheet using "data\assignee.csv", clear
cap drop ind
keep if asgseq==0
gen ind = regexm(lower(assignee), "[ a-zA-Z]*(college)[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(university)[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(regents)[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(institute of )[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(research foundation)[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(cornell)[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(mayo )[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(ben gurion)[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(yale)[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(stanford)[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(harvard)[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(research corporation)[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(clinic)[ a-zA-Z]*")
replace ind = 1 if regexm(lower(assignee), "[ a-zA-Z]*(hospital)[ a-zA-Z]*")
gen gov = (asgtype>5 & asgtype<9)
gen uni = (asgtype==9)
replace uni = uni+ind
gen corp = (asgtype==2) | (asgtype==3)
replace corp = 0 if ind==1
drop ind
save "proc\patentAsgtyp", replace
}
**********************************************************
* Create a pdpass match with established/venture/academic
**********************************************************
{
use data\assignee.dta, clear
joinby pdpass using  data\venturePatentName, unmatched(master)
gen venture = (_merge==3)
drop _merge
joinby pdpass using data\dynass, unmatched(master) update replace
gen established = (_merge==3)
drop _merge
gen academic = (cod==10|cod==11)
gen switcher = (venture==1 & established==1)
gen government = (cod>5 & cod<9)
*** Problems with 3M
replace pdpco1 = 7435 if uspto_assignee == 815358 
replace begyr1 = 1950 if uspto_assignee == 815358 
replace endyr1= 2006 if  uspto_assignee == 815358 
drop if pdpass==-19842
sort  pdpass ID venture pdpco1 begyr1 gvkey1 endyr1 venture established standard_name

duplicates drop pdpass ID venture pdpco1 begyr1 gvkey1 endyr1 venture established, force

save proc\assigneeMaster, replace
use proc\assigneeMaster, clear
keep if venture==1 | established==1 | academic==1 | government==1

save proc\assigneeCompaniesUniv, replace
keep if academic==1
keep pdpass academic
save proc\assigneeAcademic, replace
}
**********************************************************
* Create a pdpass match with established/venture/academic 
* Code a bit circular, but do not see how to code that better
**********************************************************
{
use proc\assigneeCompaniesUniv, clear
keep pdpass established academic venture switcher government
joinby pdpass using proc\pdpassPatent.dta, unmatched(master) update replace
keep patent venture established academic switcher government
collapse (max) venture established academic switcher government, by(patent)
drop if patent==""
save proc\patentCharacteristic, replace

*************************************************************************
* Use old NBER patent data to get the classification for subcategories rights
*************************************************************************
use proc\pats, clear
keep nclass  subcat
duplicates drop
drop if nclass==.
save proc\nclassSubcat, replace

**********************************************************
* Class data - patent nclass
**********************************************************


insheet using "data\class.csv", clear
keep if prim==1
destring class, force replace
*drop if class==.
*drop if patent==.
ren class nclass
keep patent nclass
duplicates drop patent, force // There are 2.948 patents (out of 4 Million) with non-unique primary patent class, seem to be a coding error (do not see a pattern).
merge 1:1 patent using proc\patentCharacteristic
foreach x in venture established academic {
replace `x'=0 if `x'==.
}
cap drop _merge
save "proc\patentClass", replace

**********************************************************
* Basic Data of Patents: appyear grant year etc.
**********************************************************

insheet using "data\patent.csv", clear
keep patent gyear appyear claims
merge 1:1 patent using proc\patentClass

save  proc\tempest, replace

* Note: here is much more information, in particular the names of the inventors and complete classes 
insheet using "data\invpat.csv", clear
bysort patent: egen minI = min(invseq)
keep if invseq==minI
keep patent state country class
compress
save proc\additionalPatentInformation, replace
}

**********************************************************
* Inventor structure
**********************************************************
{
insheet using "data\invpat_disambiguation.csv", clear
drop assignee
compress
drop upper country zipcode lat lon invseq gyear asgnum class invnum street
joinby patent using proc\pdpassPatent, unmatched(master)
drop if _merge!=3
drop _merge
joinby pdpass using proc\assigneeMaster, unmatched(master)
keep if ID!=.
drop _merge
replace established = 1 if cod==2 | cod==3
replace academic = 1 if cod==10 | cod==11
replace established = 0 if established ==.
replace venture = 0 if venture ==.
replace venture = 0 if appyear>begyr1 & begyr1!=.
replace established = 0 if appyear<=begyr1 & begyr1!=.
keep if venture==1
duplicates drop firstname lastname city, force
keep standard_name firstname lastname city state lower
compress
save proc\inventor_name, replace



insheet using "data\invpat_disambiguation.csv", clear
drop assignee
drop lower country zipcode lat lon invseq gyear asgnum class invnum state
drop firstname lastname street city
joinby patent using proc\pdpassPatent, unmatched(master)
drop standard_name
compress
drop _merge
save proc\invpat_disambiguation, replace

u proc\assigneeCompaniesUniv, clear
keep if ID!=.
keep ID pdpass  begyr1
joinby pdpass using proc\pdpassPatent, unmatched(master)
drop _merge
keep ID pdpass patent begyr1
compress
joinby patent using proc\invpat_disambiguation, unmatched(master)
keep appyear begyr* ID patent pdpass upper 
duplicates drop
save proc\assignee_patent, replace

u proc\assigneeMaster, clear
keep pdpass established academic venture government cod  begyr1
compress 
save proc\assignee_short, replace

u  proc\invpat_disambiguation, replace
keep pdpass upper patent appyear  
joinby pdpass using proc\assignee_short, unmatched(master)
keep pdpass established academic venture government cod begyr1 appyear upper
save proc\intermediate, replace


use proc\intermediate, clear
drop if pdpass ==.
compress 
gduplicates drop
replace established = (begyr1!=.)
replace academic = cod==10
gen academic1 = cod==11|cod==12|cod==13
replace established = 0 if venture==1

replace academic = 0 if academic==.
replace established = 0 if established ==.
replace venture = 0 if venture ==.

replace venture = 0 if appyear>begyr1 & begyr1!=.
replace established = 0 if appyear<=begyr1 & begyr1!=.
replace established = 0 if academic==1
replace venture = 0 if academic==1

joinby upper appyear using proc\assignee_patent, unmatched(both)

drop if appyear<1970
bysort ID upper: egen mIDapp1 = min(appyear)
replace mIDapp1 = 9999 if ID==.
bysort upper: egen mIDapp = min(mIDapp1)

global nov
foreach x in venture academic established {
bysort upper: egen p`x' = max((`x'*(appyear<mIDapp)) & mIDapp!=. )
bysort upper: egen f`x' = max((`x'*(appyear>mIDapp+1)) & mIDapp!=. )
global nov $nov p`x' f`x'
}
bysort ID: egen mIDapp_ID = min(appyear)
keep if mIDapp_ID == appyear
drop if ID==.
gen number_inventor = 1
gcollapse (sum) number_inventor (max) $nov, by(ID)

gen experience = pestablished + pacademic + pventure
gen firstTime= (experience==0)
drop experience


gen experience = festablished + facademic 
gen ftotal= (experience>0)
drop experience

foreach x in  pventure fventure pacademic facademic pestablished festablished firstTime ftotal {
ren `x' inv_`x'
}
ren inv_firstTime inventor_firstTime

save "proc\inventorStructureID", replace


insheet using "data\invpat_disambiguation.csv", clear
joinby patent using proc\patentAsgtyp, unmatched(master) update replace
drop _merge
joinby patent using proc\patentCharacteristic, unmatched(master) update replace
drop _merge
sort upper appyear

replace academic = 0 if academic==.
replace established = 0 if established ==.
replace venture = 0 if venture ==.
replace corp = 0 if academic==0
bysort upper appyear: egen macademic = max(academic)
bysort upper appyear: egen mcorp = max(corp)
gen sameYearSwitcher = (macademic==1) & (mcorp==1)
bysort upper: egen msameYearSwitcher=max(sameYearSwitcher)
replace sameYearSwitcher = msameYearSwitcher
sort upper appyear
gen firstTime = ( upper!= upper[_n-1] )
foreach x in academic  uni corp gov government established venture sameYearSwitcher   {
gen temp = 0
replace `x'=0 if `x'==.
replace temp = 1 if upper!= upper[_n-1]  & `x'>0
replace temp =  1 if (temp[_n-1] >0 | `x'>0)  & upper == upper[_n-1] 
gen p`x'  = temp
drop temp
}
preserve
keep patent firstname lastname
save inventor, replace
restore
keep patent firstTime pgov pgovernment pcorp puni pacademic pestablished pventure psameYearSwitcher
save "proc\inventorStructure", replace

use "proc\inventorStructure", replace
gen numberInventor = 1 
collapse (sum) numberInventor   (mean) firstTime psameYearSwitcher puni pcorp pgov  pgovernment pacademic pestablished pventure, by(patent) 
save "proc\inventorStructure1", replace



* Merge
use proc\tempest, clear
drop _merge
joinby patent using proc\additionalPatentInformation, unmatched(master) update replace
* Several inventors
cap drop _merge

joinby patent using proc\inventorStructure1, unmatched(master) update replace

cap drop _merge


foreach x in  pventure pestablished pacademic pgovernment pgov pcorp puni psameYearSwitcher firstTime government switcher academic established venture {
cap replace `x'=0 if `x'==.
}
compress
save proc\patent, replace



***********************************************
* Get app and gyears
************************************************
use proc\patent, clear	
keep patent appyear gyear
save proc\patentAppyearGyear, replace

}
**********************************************************
* Citation data
**********************************************************
{
insheet using "data\citation75_99.csv", clear
keep patent citation
ren patent citing
ren citation cited
save proc\citation75_99, replace

insheet using "data\citation00_10.csv", clear
keep patent citation
ren patent citing
ren citation cited
append using proc\citation75_99
compress
save proc\citationsRaw, replace

use  proc\citationsRaw, clear
ren citing patent 
joinby patent using  proc\pdpassPatent,  update replace
keep patent cited pdpass
ren pdpass pdpassTo
ren patent citing
ren cited patent
joinby patent using  proc\pdpassPatent,  update replace
gen selfCite =(pdpass==pdpassTo)
ren patent cited
keep citing cited selfCite
compress
save  proc\citations, replace

use  proc\citations, clear
ren citing patent 
joinby patent using  proc\patentAppyearGyear,  update replace
keep cited patent appyear selfCite
drop if appyear==.
ren appyear appyearCiting
ren patent citing
ren cited patent
joinby patent using  proc\patentAppyearGyear,  update replace
keep patent citing gyear appyearCiting selfCite
save citesYear, replace
drop if appyearCiting > gyear+3 
ren patent cited
save  proc\citationsCleaned, replace


global patentFeatures  venture established  academic government
* Create originality measures
use  proc\citations, clear
ren cited patent 
joinby patent using  proc\patentClass,  update replace
joinby patent using  proc\patentAppyearGyear,  update replace
replace switcher = 0 if switcher==.
replace government = 0 if government==.
compress
gen one = 1 
collapse (sum) one selfCite $patentFeatures, by(citing nclass)
compress
bysort citing: egen summer =  total(one)
gen share = one/summer
replace share = (share)^2
collapse (sum)  summer share one selfCite  $patentFeatures, by(citing)
compress
gen originality = (1-(1+(one-1)*share)/one)
foreach x in $patentFeatures selfCite {
gen `x'ShareBackward = `x'
}
ren citing patent 
ren one backwardCites
drop share summer  $patentFeatures
compress
save  proc\originality.dta, replace

* Create generality measures
use  proc\citationsCleaned, clear
ren citing patent 
joinby patent using  proc\patentClass, unmatched(master) update replace
replace switcher = 0 if switcher==.
replace government = 0 if government==.
compress
drop _merge
gen one=1
gen onePat = 1
sort cited
replace selfCite = onePat-selfCite
collapse (sum) one onePat selfCite $patentFeatures, by(cited nclass)
compress

bysort cited: egen summer =  total(one)
gen share = one/summer
replace share = (share)^2
collapse (sum) summer share one onePat  selfCite  $patentFeatures , by(cited)
gen generality = (1-(1+(one-1)*share)/one)
foreach x in $patentFeatures  selfCite{
gen `x'ShareForward = `x'
}
ren cited patent 
ren onePat forwardCites
drop share summer  one $patentFeatures
compress
save  proc\generality.dta, replace

* Merge these two
use  proc\generality.dta, clear
merge 1:1 patent using  proc\originality
drop _merge
compress
foreach x in generality originality forwardCites backwardCites  establishedShareBackward academicShareBackward ventureShareBackward selfCite {
replace `x' = 0 if `x'==.
}
compress
save  proc\generalityOriginality, replace
}
**************************************************************************
*Create Patent Master Data Set
**************************************************************************
{
*** Merge generality and originalilty to patent data
use  proc\patent.dta, clear
cap drop _merge

merge 1:1 patent using  proc\generalityOriginality
drop _merge
foreach x in generality originality forwardCites backwardCites  establishedShareBackward academicShareBackward ventureShareBackward selfCite  {
replace `x' = 0 if `x'==.
}

joinby nclass using  proc\nclassSubcat, unmatched(master) update replace
drop _merge
drop if nclass==.
compress

save  proc\patentRaw, replace

use  proc\patentRaw, clear
quietly {
gen patentCount=1
foreach x in   forwardCites backwardCites  selfCite {
bysort gyear nclass: egen summer = mean(`x')
gen `x'_scaled = `x'/(summer+1)
sum `x'_scaled if `x'_scaled>0, detail
replace `x'_scaled=r(p95) if `x'_scaled>r(p95)
drop summer 
}
foreach x in originality generality   {
bysort gyear nclass: egen summer = mean(`x')
gen `x'_scaled = `x'/(summer+1)
sum `x'_scaled if `x'_scaled>0, detail
replace `x'_scaled=r(p95) if `x'_scaled>r(p95)
drop summer 
}



foreach x in  patentCount {
bysort gyear nclass: egen summer =  total(`x')
gen summer1 = summer-`x'
gen `x'_scaled = `x'/(summer+1)

drop summer summer1
}
}
drop if appyear==.
sum appyear
compress
save  proc\patentsComplete, replace

use  proc\patentsComplete, clear
cap drop forwardCitesP99 forwardCitesP50 outstanding_scaled above_scaled size outstanding above

bysort gyear nclass: egen forwardCitesP99=pctile(forwardCites_scaled) , p(95)  
bysort gyear nclass : egen forwardCitesP50=pctile(forwardCites_scaled), p(50) 
bysort gyear nclass: egen size =  total(patentCount)
 
gen outstanding = (forwardCites_scaled>forwardCitesP99)
gen above = (forwardCites_scaled>forwardCitesP50)

foreach x in  outstanding above  {
bysort gyear nclass: egen summer =  total(patentCount)
gen summer1 = summer-`x'
gen `x'_scaled = `x'/(summer+1)
drop summer summer1
}

compress
save   proc\patentsComplete, replace


use  proc\patentsComplete, clear
keep if country=="US"
save  proc\patentsCompleteSelect, replace
}
*********************************************************************************************************************************************************************
*Complete Patents
*********************** **********************************************************************************************************************************************
{
global relevantFeatures $articles  firstTime  psameYearSwitcher puni pgov pgovernment pcorp pacademic pestablished pventure outstanding_scaled selfCite above_scaled outstanding  above switcher  patentCount  originality generality backwardCites forwardCites  originality_scaled generality_scaled backwardCites_scaled forwardCites_scaled patentCount_scaled   ventureShareBackward establishedShareBackward academicShareBackward ventureShareForward establishedShareForward academicShareForward
use  proc\pdpassPatent, clear
drop standard_name
duplicates drop
compress
save  proc\pdpassPatentJoin, replace

use  proc\assigneeCompaniesUniv, clear
compress
joinby pdpass using  proc\pdpassPatentJoin.dta, unmatched(master) update replace
cap drop _merge
save  proc\merger, replace

clear all
use  proc\merger, clear
duplicates drop
cap drop _merge
joinby patent using  proc\patentsCompleteSelect.dta, unmatched(master) update  // merge patent data
compress
save merger, replace
use merger, clear
cap drop _merge
gen gvkey=.
ren appyear year
forvalue i=1/5 {
replace gvkey = gvkey`i' if gvkey`i'~=. & year>=begyr`i' & year<=endyr`i'
}
save proc\patentsComplete_Full, replace
}


use data\main_file_novelty, clear
gen complexity = 0 
replace complexity = 1 if  cat==2 | cat==4 | subcat==32 | subcat==33
egen groupID= group(nclass0 appln_y)

merge 1:1 publn_nr using "data\claims"
keep if _merge==3
drop _merge

global nov
* control for tech x filing year and number of words for measures depending on word length
foreach x in novelty new_words  {
sum `x', d
areg `x' ib10.number_keywords, ab(groupID) 
predict `x'_res, d 
replace `x'_res = `x'_res+_b[_cons]
replace  `x'_res = `x' - `x'_res
global nov $nov  `x' `x'_res
} 
* control for tech x filing year
foreach x in   num_figures length_first_claim num_sheets   {
sum `x', d
areg `x' , ab(groupID) 
predict `x'_res, d 
replace `x'_res = `x'_res+_b[_cons]
replace  `x'_res = `x' - `x'_res
global nov $nov  `x' `x'_res
}

sum novelty_res, d
gen g_pat = novelty_res<r(p50)
gen g_pat_75 = novelty_res<r(p25)
sum num_figures, d
gen i_pat =  num_figures>r(p50)

sum length_first_claim_res, d
gen w_pat =  length_first_claim_res>r(p50)
gen w_pat_75 =  length_first_claim_res>r(p25)

gen gw_pat = g_pat * w_pat
gen ugw_pat = (g_pat==0) * w_pat
gen guw_pat = g_pat * (w_pat==0)


gen gi_pat = g_pat * i_pat
gen gui_pat = g_pat * (i_pat==0)
gen ugi_pat = (g_pat==0) * i_pat


gen giw_pat = g_pat*i_pat*w_pat
gen gist_pat = g_pat*i_pat*(w_pat==0)
gen guiw_pat = g_pat*(i_pat)*w_pat
gen guist_pat = g_pat*(i_pat==0)*(w_pat==0)


gen gcomp = g_pat*complexity
gen gwcomp = g_pat*w_pat*complexity
gen guwcomp = g_pat*(w_pat==0)*complexity
gen wcomp = w_pat*complexity
gen uwcomp = (w_pat==0)*complexity
gen comp = complexity

ren length_first_claim_res lf_claim_res

global nov $nov g_pat g_pat_75 i_pat w_pat lf_claim_res 
replace novelty  = novelty *100
sum novelty, d

sum novelty_res, d
replace novelty_res = r(p95) if novelty_res>r(p95)
replace novelty_res = r(p95)-novelty_res


save proc\\main_file_value_of_science_res, replace



use proc\patentsComplete_Full, replace

gen publn_nr = patent
destring publn_nr, force replace
drop if publn_nr ==.
merge m:1 publn_nr using   proc\\main_file_value_of_science_res
drop if _merge==2
drop _merge




preserve
keep if ID!=.
bysort ID: egen min_appln_y = min(appln_y)
keep if appln_y == min_appln_y
bysort ID: egen min_publn_nr = min(publn_nr )
keep if publn_nr  == min_publn_nr


foreach x in lf_claim_res novelty_res g_pat i_pat g_pat_75 w_pat w_pat_75 {
ren `x' `x'_ab
}
gcollapse (mean) *_ab , by(ID )
save proc\patentsScienceID, replace

restore
keep if gvkey!=.
bysort gvkey: egen min_appln_y = min(appln_y)
keep if appln_y == min_appln_y
bysort gvkey: egen min_publn_nr = min(publn_nr )
keep if publn_nr  == min_publn_nr

foreach x in  lf_claim_res novelty_res g_pat i_pat g_pat_75 w_pat w_pat_75    {
ren `x' `x'_ab
}
gcollapse (mean) *_ab , by(gvkey )
save proc\patentsScienceGvkey, replace



* Merge second order characteristics
use proc\patentsComplete_Full, clear


replace year = year+1

cap drop _merge

gen cat = floor(subcat/10)
gen complexity = 0 
replace complexity = 1 if  cat==2 | cat==4 | subcat==32 | subcat==33

foreach x in  cat subcat nclass {
bysort ID gvkey: egen mode_`x' = mode(`x'), maxmode
}
ren nclass nclass0

gen publn_nr = patent
destring publn_nr, force replace
drop if publn_nr ==.

joinby publn_nr using "data\claims", unmatched(master)
drop _merge

save proc\patentsComplete_Full_extend, replace

use proc\patentsComplete_Full, clear
keep ID gvkey year  patent forwardCites_scaled 
replace ID = 0 if ID==.
replace gvkey=0 if gvkey==.
duplicates drop
save proc\patentsComplete_merge, replace

use proc\citationsRaw, clear
ren citing patent 
joinby patent using proc\patentsComplete_merge, unmatched(master) update replace

foreach x in  ID gvkey forwardCites_scaled year _merge{
ren `x' `x'1
}
*  Marked by 1 means it is citing
ren patent citing
ren cited patent
joinby patent using proc\patentsComplete_merge, unmatched(master) update replace
ren patent cited
save proc\temp, replace

use proc\temp, clear
drop if ID==. & ID1==. & gvkey==. & gvkey1==.
drop if ID==0 & ID1==0 & gvkey==0 & gvkey1==0

foreach x in ID ID1 gvkey gvkey1  year year1 {
replace `x' = 0 if `x'==.
}
bysort ID1 gvkey1 year1: egen backwardCites = nvals(cited)
replace backwardCites=0 if backwardCites==.
bysort ID1 gvkey1 year1: egen patentCount = nvals(citing)
bysort citing: egen citation =  total( (_merge1==3)*(_merge==3))
replace citation = 1 if citation > 1
* Constructing weight11
collapse (mean) patentCount backwardCites (sum)  citation ,by(ID ID1 gvkey gvkey1 year year1)
compress
drop if (ID==ID1) & (gvkey==gvkey1)
gen citationPatent = (citation>0)
gen weight11 = (citation>=1)
gen weight12=citationPatent/(patentCount)
replace weight12 = 0 if weight12==.
gen weight13=citationPatent/(backwardCites)
replace weight13 = . if weight12==0
save proc\method1,replace 


*********************************************************************************************************************************************************************
*Venture Capital Patents
*********************************************************************************************************************************************************************
global relevantFeatures  complexity  firstTime   pacademic pestablished pventure  pgovernment numberInventor selfCite selfCite_scaled outstanding_scaled  above_scaled outstanding  above switcher  patentCount  originality generality backwardCites forwardCites  originality_scaled generality_scaled backwardCites_scaled forwardCites_scaled patentCount_scaled   ventureShareBackward establishedShareBackward academicShareBackward ventureShareForward establishedShareForward academicShareForward 

use proc\patentsComplete_Full_extend, clear
keep if venture==1
save proc\mergeVenturePatentsRaw.dta, replace // this file is used in the calculation of the shares
** Aggregate Company - nclass
preserve 
gcollapse (sum) $relevantFeatures, by(ID  nclass)
bysort ID: egen num = count(patentCount)
save proc\mergeVenturePatentsFull, replace
restore

** Aggregate Company - year
gcollapse (max) mode_nclass mode_subcat mode_cat (sum)   $relevantFeatures , by(ID year)
save proc\patentVentureMerge.dta, replace

use proc\mergeVenturePatentsRaw.dta, replace // this file is used in the calculation of the shares
cap drop publn_nr
gen publn_nr = patent
destring publn_nr, force replace
drop if publn_nr ==.
merge m:1 publn_nr using   proc\\main_file_value_of_science_res
keep if _merge==3
gcollapse (max) n_fig_max=num_figures general_scaled_max=generality_scaled orig_scaled_max=originality_scaled (min) novelty_min = novelty  novelty_res_min = novelty_res  lf_claim_min=length_first_claim lf_claim_res_min=lf_claim_res  , by(ID)
save proc\patentVentureMerge_averages.dta, replace


use proc\mergeVenturePatentsRaw.dta, replace // this file is used in the calculation of the shares
cap drop publn_nr
gen publn_nr = patent
destring publn_nr, force replace
drop if publn_nr ==.
merge m:1 publn_nr using   proc\\main_file_value_of_science_res
gcollapse (sum) novelty novelty_res lf_claim_res  length_first_claim  , by(ID year)
save proc\patentVentureMerge_additional.dta, replace


*********************************************************************************************************************************************************************
*Compustat Patents
*********************************************************************************************************************************************************************
use proc\patentsComplete_Full_extend, clear
keep if established==1
merge m:1 publn_nr using   proc\main_file_value_of_science_res
drop if _merge==2
drop _merge
keep if gvkey~=.
sort gvkey year
cap drop number
gen number=1
cap drop _merge
cap drop ID
save proc\mergeCompuPatentsRaw.dta, replace

** Aggregate Company - nclass
preserve 
gcollapse (sum) $relevantFeatures , by(gvkey nclass)
bysort gvkey: egen num = count(patentCount)
save proc\mergeCompuPatentsFull, replace
restore

** Aggregate Company - year
gcollapse  (mean)  length_first_claim process (max) mode_nclass mode_subcat mode_cat (sum) $relevantFeatures ,by(gvkey year)
save proc\patentCompuMerge.dta, replace



use proc\mergeCompuPatentsRaw.dta, replace
cap drop publn_nr
gen publn_nr = patent
destring publn_nr, force replace
drop if publn_nr ==.
merge m:1 publn_nr using   proc\main_file_value_of_science_res
gcollapse (sum) novelty novelty_res lf_claim_res  length_first_claim   , by(gvkey year)
drop if gvkey==0
save proc\patentCompusMerge_additional.dta, replace


use proc\patentsComplete_Full_extend, clear
keep if gvkey~=.
keep gvkey subcat 
gen subcat_number_ = 1
drop if subcat==.
gcollapse (sum) subcat_number_, by(subcat gvkey)
bysort gvkey: egen total = sum(subcat_number_)
gen subcat_share_ = subcat_number_/total
replace subcat_share_ = floor(subcat_share_*100)
drop if subcat_share_ ==0
drop total
reshape wide subcat_share_ subcat_number_, i(gvkey) j(subcat)
unab k: subcat_*
foreach x in `k' {
replace `x' = 0 if `x'==.
}
save  proc\tech_dist_gvkey, replace




*******************************************************************************
* Additional patent characterstics
***************************************************************************

* Inventor Data
import delimited "data\LinkedInventorNameLocData.txt", delimiter("|") clear encoding("utf-8")
ren v1 ID 
ren v2 pat 
ren v3 name 
ren v4 lat_lng 
ren v5 qual 
ren v6 loctype 
keep ID name pat lat_lng 
gen publn_auth = substr(pat, 1,2)
gen publn_nr = substr(pat, 3,.)
keep if publn_auth=="US"
destring publn_nr, force replace
drop if publn_nr==.
split lat_lng, parse(",")
ren lat_lng1 lat
ren lat_lng2 lng
drop lat_lng
destring lat lng, force replace
drop if lat==.

drop pat
save "proc\inventor_lat_lng_morrison", replace
keep lat lng
gduplicates drop
save "proc\inventor_lat_lng", replace


**************************************************************************
* Calculate network closeness based on coauthorship
**************************************************************************

* baseline_data
import delimited "data\LinkedInventorNameLocData.txt", delimiter("|") clear encoding("utf-8")
ren v1 ID 
ren v2 pat 
ren v3 name 
ren v4 lat_lng 
ren v5 qual 
ren v6 loctype 
gen publn_auth = substr(pat, 1,2)
gen publn_nr = substr(pat, 3,.)
keep if publn_auth=="US"
destring publn_nr, force replace
drop if publn_nr==.
split lat_lng, parse(",")
ren lat_lng1 lat
ren lat_lng2 lng
drop lat_lng
destring lat lng, force replace
drop if lat==.
joinby publn_nr publn_auth using data\main_file_novelty, unmatched(master)
egen inventor_ID = group(ID)
keep publn_nr inventor_ID appln_y ID
gduplicates drop
save "proc\inventor_lat_lng_morrison", replace
save "proc\master_ID_inventorID", replace

forvalues i = 1980(1)2006 {
use publn_nr inventor_ID appln_y using "proc\master_ID_inventorID"  if appln_y<`i', clear
ren inventor_ID inventor_ID2
save "proc\temp", replace
ren inventor_ID2 inventor_ID1
joinby publn_nr using  "proc\temp"
drop if inventor_ID1 ==inventor_ID2
drop publn_nr appln_y
gduplicates drop
save "proc\d1_`i'", replace
ren inventor_ID1 inventor_ID3
joinby inventor_ID2 using "proc\d1_`i'"
drop inventor_ID2
drop if inventor_ID1 ==inventor_ID3
gduplicates drop
ren inventor_ID3 inventor_ID2
append using  "proc\d1_`i'"
gduplicates drop
save "proc\d2_`i'", replace
}

use proc\patentsComplete_Full_extend, clear
keep ID gvkey patent
gen publn_nr = patent
destring publn_nr, force replace
drop if publn_nr==.
ren ID IDVenture
joinby publn_nr using "proc\master_ID_inventorID"
drop ID
ren IDVenture ID
drop publn_nr
ren inventor_ID inventor_ID1
gduplicates drop
replace ID = 0  if ID==.
replace gvkey = 0  if gvkey==.
save  "proc\master_gvkey_ID_inventorID1", replace
ren inventor_ID1 inventor_ID2
ren gvkey gvkey2 
ren ID ID2
save  "proc\master_gvkey_ID_inventorID2", replace


use "proc\master_gvkey_ID_inventorID2", clear
cap drop  patent
save  "proc\master_gvkey_ID_inventorID2", replace

forvalues i = 1980(1)2004 {
use  "proc\master_gvkey_ID_inventorID1" if appln_y<`i', replace
drop appln_y
bysort ID gvkey: egen nvals_inventorID1 = nvals(inventor_ID1)

joinby inventor_ID1 using "proc\d2_`i'"
joinby inventor_ID2 using  "proc\master_gvkey_ID_inventorID2"
keep if appln_y<`i'
bysort ID2 gvkey2: egen nvals_inventorID2 = nvals(inventor_ID2)
drop if ID==ID2 & gvkey == gvkey2
gduplicates drop
gen one = 1
gen year = `i'
gcollapse (sum) one (mean) nvals_inventorID1 nvals_inventorID2, by(ID ID2 gvkey gvkey2 year)
gen proximity_coauthor = (one/(nvals_inventorID1+nvals_inventorID2))^0.5
replace proximity_coauthor = floor(proximity_coauthor*1000)/1000
gduplicates drop
save "proc\bilateral_`i'", replace
}


forvalues i = 2005(1)2006 {
use  "proc\master_gvkey_ID_inventorID1" if appln_y<`i', replace
drop appln_y patent
bysort ID gvkey: egen nvals_inventorID1 = nvals(inventor_ID1)

joinby inventor_ID1 using "proc\d2_`i'"
joinby inventor_ID2 using  "proc\master_gvkey_ID_inventorID2"
keep if appln_y<`i'
bysort ID2 gvkey2: egen nvals_inventorID2 = nvals(inventor_ID2)
drop if ID==ID2 & gvkey == gvkey2
gduplicates drop
gen one = 1
gen year = `i'
gcollapse (sum) one (mean) nvals_inventorID1 nvals_inventorID2, by(ID ID2 gvkey gvkey2 year)
gen proximity_coauthor = (one/(nvals_inventorID1+nvals_inventorID2))^0.5
replace proximity_coauthor = floor(proximity_coauthor*1000)/1000
gduplicates drop
save "proc\bilateral_`i'", replace
}

use "proc\bilateral_2004", clear
save "proc\bilateral_2005", replace
save "proc\bilateral_2006", replace

use  "proc\bilateral_1980", clear
forvalues i = 1981(1)2006 {
cap append using "proc\bilateral_`i'"
}
keep ID ID2 gvkey gvkey2 year proximity_coauthor
gduplicates drop
save "proc\bilateral_coauthor", replace
use "proc\bilateral_coauthor", replace
keep if ID>0
keep ID proximity
gcollapse (mean) proximity, by( ID)
sum proximity, d

*gen receiver = proximity > r(p50)
ren proximity prox_bi
save "proc\receiver_bilateral", replace


* Hiring

use  "proc\master_gvkey_ID_inventorID1", replace
bysort ID gvkey: egen nvals_inventorID1 = nvals(inventor_ID1)
ren inventor_ID1 inventor_ID2
ren appln_y appln_y1
joinby inventor_ID2 using "proc\master_gvkey_ID_inventorID2"
bysort ID2 gvkey2: egen nvals_inventorID2 = nvals(inventor_ID2)
gen one = 1
gcollapse (sum) one (mean) nvals_inventorID1 nvals_inventorID2, by(ID ID2 gvkey gvkey2  appln_y)
gen proximity_move = (one/(nvals_inventorID1+nvals_inventorID2))^0.5
replace proximity_move = floor(proximity_move*1000)/1000
keep ID ID2 gvkey gvkey2 appln_y proximity_move
gduplicates drop
save "proc\bilateral_move", replace
use "proc\bilateral_move", replace
keep if ID2>0
keep ID2 proximity
gcollapse (mean) proximity, by( ID2)
sum proximity, d

ren ID2 ID

ren proximity prox
save "proc\receiver", replace




************************************
* Future inventor engagement
************************************


insheet using "data\invpat_disambiguation.csv", clear
*keep if country=="US"

drop assignee
drop lower country zipcode lat lon invseq gyear asgnum class invnum state
drop firstname lastname street city
joinby patent using proc\pdpassPatent, unmatched(master)
drop standard_name
compress
drop _merge
save proc\invpat_disambiguation, replace

u proc\assigneeCompaniesUniv, clear
keep if ID!=.
keep ID pdpass  begyr1
joinby pdpass using proc\pdpassPatent, unmatched(master)
drop _merge
keep ID pdpass patent begyr1
compress
joinby patent using proc\invpat_disambiguation, unmatched(master)
keep appyear begyr* ID patent pdpass upper 
duplicates drop
save proc\assignee_patent, replace

u proc\assigneeMaster, clear
keep pdpass established academic venture government cod  begyr1
compress 
save proc\assignee_short, replace

u  proc\invpat_disambiguation, replace
keep pdpass upper patent appyear  
joinby pdpass using proc\assignee_short, unmatched(master)
keep pdpass established academic venture government cod begyr1 appyear upper
save proc\intermediate, replace

use proc\intermediate, clear
drop if pdpass ==.
compress 
gduplicates drop
replace established = (begyr1!=.)
replace academic = cod==10|cod==11|cod==12|cod==13
replace established = 0 if venture==1

replace academic = 0 if academic==.
replace established = 0 if established ==.
replace venture = 0 if venture ==.

replace venture = 0 if appyear>begyr1 & begyr1!=.
replace established = 0 if appyear<=begyr1 & begyr1!=.
replace established = 0 if academic==1
replace venture = 0 if academic==1

gcollapse (max) academic established venture government, by(upper appyear)

gsort upper - appyear 

foreach x in academic venture government {
replace established =0 if `x'>0
}

foreach x in academic   government {
gen temp = 0
replace `x'=0 if `x'==. 
replace temp =  1 if (temp[_n-1] >0 | `x'[_n-1]>0)  & upper == upper[_n-1] 
gen f`x'  = temp
drop temp
}

foreach x in established    {
gen temp = 0
replace `x'=0 if `x'==. 
replace temp =  1 if (temp[_n-1] >0 | `x'[_n-1]>0)  & upper == upper[_n-1] 
gen f`x'  = temp
drop temp
}

foreach x in venture    {
gen temp = 0
replace `x'=0 if `x'==.
replace temp =  1 if (temp[_n-1] >0 | `x'[_n-1]>0)  & upper == upper[_n-1] 
gen f`x'  = temp
drop temp
}

bysort upper: egen tpatent = sum(1)
gen ftotal= festablished+fventure+facademic
bysort upper: egen last_patent = max(appyearstr)
foreach x in ftotal academic government established venture {
}

drop academic established government venture

joinby upper appyear using proc\assignee_patent, unmatched(both)
compress
save proc\inventorFullF, replace

use proc\inventorFullF, replace
keep patent fgovernment facademic festablished fventure ftotal tpatent 
gen publn_nr = patent
destring publn_nr, force replace
merge m:1 publn_nr using proc\\main_file_value_of_science_res
keep if _merge==3
global pnov
foreach x in fgovernment facademic festablished fventure ftotal tpatent  {
gen `x'g = `x'*g_pat
gen `x'w = `x'*w_pat
gen `x'gw = `x'*gw_pat
global pnov $pnov  `x'g `x'w `x'gw `x'
}


save proc\inventorStructureF, replace

use proc\inventorStructureF, clear
gen numberInventor = 1 
drop if patent==""

gcollapse  (mean) $pnov , by(patent) 
save proc\inventorStructureF, replace

use proc\inventorFullF, clear
cap drop _merge
drop if appyear>begyr1 & begyr1!=.
bysort ID upper: egen mapp = min(appyear)
bysort ID: egen mIDapp = min(appyear)

keep if mIDapp==appyear
drop if mapp==.


foreach x in  ftotal tpatent    fgovernment facademic festablished fventure  {
replace `x'=0 if `x'==.
}
drop if patent==""
bysort ID: egen total = sum(1)

duplicates drop patent ID upper, force
gen publn_nr = patent
destring publn_nr, force replace
merge m:1 publn_nr using proc\main_file_value_of_science_res
keep if _merge==3
global pnov
foreach x in    ftotal tpatent fgovernment facademic festablished fventure{
gen `x'g = `x'*g_pat
gen `x'w = `x'*w_pat
gen `x'gw = `x'*gw_pat
global pnov $pnov  `x'g `x'w `x'gw `x'
}



gcollapse  (max) $pnov, by(upper ID) 


gen numberInventor = 1 

collapse    (max)  $pnov, by(ID) 

foreach x in  $pnov  {
ren `x' inv_`x'
}
foreach x in $pnov {
replace inv_`x' = inv_`x'>0
}

save proc\inventorStructureIDF, replace



****************************************************************************************
* Claims data
****************************************************************************************
/*
* Claim.tsv is available for download https://www.patentsview.org/download/
import delimited "data\claim.tsv", colrange(2:)  clear 
destring dependent, force replace
keep if dependent==-1
ren patent_id publn_nr
destring publn_nr, force replace
replace text = lower(text)
gen process = regexm(text, "a method") | regexm(text, "a process") 
bysort publn_nr: egen m_process = max(process)
replace process = m_process
bysort publn_nr: egen min_seq = min(sequence)
keep if sequence == min_seq
gen length_first_claim = wordcount(text)
keep publn_nr process length
gduplicates drop
drop if publn_nr==.
gduplicates drop
* Widsorize at the 99% percentile
replace  length_first_claim = 3130 if  length_first_claim>3130 & length_first_claim!=.
save  "proc\claims", replace
*/

}
*/
*********************************************************************************************************************************************************************
* III) Venture Capital Instruments
*********************************************************************************************************************************************************************
{


* Area Codes
import delimited "data\area_codes.csv", varnames(1) clear 
ren statecode Statecode
ren state State
save "proc\area_codes", replace
ren areacode FundAreaCode
keep FundAreaCode Statecode
ren Statecode State
save "proc\codes.dta", replace

* Local of Start-up
import delimited "data\cities_states.csv", varnames(1) clear 
ren state State
joinby State using "proc\area_codes", unmatched(master)
keep city State Statecode
ren Statecode state
duplicates drop
sort city
gen number = 1
replace number = number[_n-1]+1  if  city==city[_n-1]
drop State
reshape wide state, i(city) j(number)
save "proc\cityStateCode", replace



* Fund Raising Data
import excel "data\FUNDS.xls", sheet("Fundraising Report") firstrow clear
keep if FundNation =="United States"
destring FundAreaCode, force replace
merge m:1 FundAreaCode using "proc\codes.dta"
drop _merge
preserve 
collapse (sum) FundSizeUSDMil, by(FundYear FirmName)
save fundYear, replace
restore
preserve
keep FirmName FundAreaCode State FundCity
sort  FirmName FundAreaCode State FundCity
duplicates drop FirmName FundAreaCode State, force
save funds.dta, replace // to be used later in the name match
restore
gen FundVenture = FundSizeUSDMil  * (FundType!="Buyout")
replace FundSizeUSDMil=FundSizeUSDMil* (FundType=="Buyout")
drop if (FundSizeUSDMil>2200 | FundSizeUSDMil<5) & FundVenture==0
collapse (sum) FundSizeUSDMil FundVenture, by(FundFoundedYear State)
drop if State==""
ren FundFoundedYear year
destring year, force replace
replace year = year+2
ren State state
ren FundSizeUSDMil FundSizeUSDMilRaw
label var FundSizeUSDMilRaw "Buy-out Fundraising"
label var FundVenture "Venture Fundraising"
save "proc\fundRaisingRaw.dta", replace

* Fund Raising Data - Cross State Distribution
import excel "data\FUNDS.xls", sheet("Fundraising Report") firstrow clear
keep if FundType=="Venture Capital" 
keep if FundNation =="United States"
destring FundAreaCode, force replace
merge m:1 FundAreaCode using proc\codes.dta
keep FirmName State
ren FirmName fund
save "proc\fundState",replace


* Patent Data- Cross-State Distribution Data
use "proc\mergeVenturePatentsRaw", clear
keep ID state
sort ID state
duplicates drop ID, force
save "proc\firmState.dta", replace

* Patent Data major state
use proc\mergeVenturePatentsRaw, clear
keep ID state
gen one = 1
collapse (sum)  one, by(ID state)
bysort ID: egen mOne = max(one)
keep if mOne==one
keep ID state

sort ID state
duplicates drop ID, force // 78 companies have the same number of patents in two states, pick by random.
save "proc\IDstate", replace


use proc\rawDataVenture_work.dta, clear
joinby ID using proc\firmState.dta
collapse (sum) Amount, by(fund state)
joinby fund using proc\fundState
save ventureFundDistAbsolut, replace
duplicates drop
collapse (sum) Amount, by(state State)
bysort State: egen summer =  total(Amount)
gen share = Amount/summer
keep State state share
* State is the Fund raising state, state is the getting state
order State state 
drop if State==""
save "proc\ventureFundDistribution", replace


* Merge cross-state distribution and lagged companies
use proc\fundRaisingRaw, clear
ren state State
joinby State using proc\ventureFundDistribution
gen FundSizeUSDMil = FundSizeUSDMilRaw*share
collapse (sum) FundSizeUSDMil, by(state year)
label var FundSizeUSDMil "Buy-out / Cross-state "
merge 1:1 year state using proc\fundRaisingRaw
label var FundSizeUSDMilRaw "Buy-out"
replace FundSizeUSDMilRaw = 0 if FundSizeUSDMilRaw==.
drop _merge
save "proc\fundRaising.dta", replace


******Create State specific instrument
use proc\IDven.dta, clear //Recursive
joinby ID using proc\mergeVenturePatentsRaw.dta, unmatched(master)
gen one=1
replace state="Rest" if state==""
collapse (sum) one, by(state ID) //Note that these are not year-specific
bysort ID: egen summer= total(one) // sum total over company
replace one = one/summer // no funding is proportional to share within state
sort ID state  
joinby state  using proc\fundRaising
ren FundSizeUSDMil FundSize
ren FundSizeUSDMilRaw FundSizeRaw
replace FundSize=0 if FundSize==.
replace FundSize = one * FundSize
replace FundSizeRaw = one * FundSizeRaw
collapse (sum) FundSize FundSizeRaw, by(ID year)
xtset ID year
joinby ID using proc\firmState.dta
label var FundSize "Buy-out / Cross-state / Firm specific"
label var FundSizeRaw "Buy-out / Firm specific"

save "proc\fundInstrument.dta", replace



}


************************************************************************************************************
* IV) Basic Dataset: Data for Venture Capital - Put all togehter
************************************************************************************************************
{
*** Balance the & add patents
use "proc\rawDataVenture_work.dta", clear
gen currentRoundID =1 if currentRound=="Seed"
replace currentRoundID =2 if currentRound=="Early Stage"
replace currentRoundID =3 if currentRound=="Expansion"
replace currentRoundID =4 if currentRound=="Later Stage"

collapse    (max) fundingRoundNumber currentRoundID (mean) pIndex  Amount*  success    SIC firstInvestment lastInvestment, by(ID firmName year CompanyStatus)


sort ID year
duplicates drop ID year, force
xtset ID year
tsfill, full

merge 1:1 ID year using "proc\patentVentureMerge.dta", update // Patent Data
drop if _merge==2
save proc\ventureDescriptive, replace
cap drop _merge 


global relevantFeatures  above_scaled outstanding_scaled above outstanding  originality generality backwardCites forwardCites patentCount_scaled patentCount originality_scaled generality_scaled backwardCites_scaled forwardCites_scaled patentCount_scaled   

foreach x in  $relevantFeatures {
replace `x' = 0 if `x'==.
}

ren firmName FirmName
gsort +ID + year
replace FirmName=FirmName[_n-1] if ID==ID[_n-1] & FirmName[_n-1]!=""

foreach x in SIC success firstInvestment currentRoundID {
replace `x' = `x'[_n-1] if ID==ID[_n-1] & `x'[_n-1]!=.  & `x'[_n]==.
}
* Start again here.
gsort +  ID - year
replace FirmName=FirmName[_n-1] if ID==ID[_n-1] & FirmName[_n-1]!=""

foreach x in SIC success firstInvestment {
replace `x' = `x'[_n-1] if ID==ID[_n-1] & `x'[_n-1]!=.
}

*** Create Presample Fixed Effects out of patent data
gen preSummer = 0
replace preSummer = forwardCites_scaled if year<firstInvestment  & year>=firstInvestment-5
bysort ID: egen preSforwardCites_scaled=  total(preSummer)
replace preSforwardCites_scaled = 0 if preSforwardCites_scaled==.
gen pre1978SforwardCites_scaled= preSforwardCites_scaled 
drop  preSummer

gen preSummer = 0
replace preSummer = patentCount if year<firstInvestment  & year>=firstInvestment-5
bysort ID: egen preSpatentCount=  total(preSummer)
replace preSpatentCount = 0 if preSpatentCount==.
gen pre1978SpatentCount= preSpatentCount
drop  preSummer

drop if pre1978SpatentCount>100


*** Add Fund Raising Instrument
cap drop _merge
merge 1:1 ID year using "proc\fundInstrument.dta"
drop if _merge==2
drop _merge

*** Add Fund Raising Instrument
cap drop _merge
merge m:1 year state using proc\fundRaising.dta
drop if _merge==2
drop _merge


* Merge state data
merge m:1 ID  using "proc\IDstate.dta" // Patent Data
drop if _merge==2






bysort ID: egen lastInvestmentImpute = min(lastInvestment)
replace lastInvestment = lastInvestmentImpute 
drop lastInvestmentImpute

**** Create Corrected Investment
bysort ID: egen summer =  total(patentCount)
gen outOfSample = (summer==0)
replace Amount= 0 if Amount==.
bysort outOfSample year SIC: egen sumAmount = total(Amount)
bysort year SIC: egen sumAmountTotal = total(Amount)
drop summer

gen correctionFactorAmount =sumAmount/sumAmountTotal
replace correctionFactorAmount = 1 if correctionFactorAmount==.
sort ID year
gen AmountCorr= Amount/correctionFactorAmount

*** Select out companies without a patent between 1979 and 1999
gen patentCount_tmp = patentCount * (year>=1979)*(year<2000)
bysort ID: egen sumPatentCount=sum(patentCount_tmp)
drop if sumPatentCount==0
drop sumPatentCount patentCount_tmp

cap drop _merge
merge m:1 ID  using "proc\inventorStructureID"
keep if _merge==3
drop _merge


merge m:1 ID  using "proc\patentsScienceID"
keep if _merge==3
drop _merge


merge m:1 ID   using "proc\patentVentureMerge_averages"
drop if _merge==2
drop _merge


merge m:1 ID  using "proc\receiver"
drop if _merge==2
drop _merge


merge m:1 ID  using "proc\receiver_bilateral"
drop if _merge==2
drop _merge



* First Stage Regression
{
bysort ID: egen summer = total(Amount)
drop if summer==0
drop summer


replace FundSize=0 if FundSize==.
replace FundSize = ln(FundSize+1)

global lister Amount AmountCorr
global Inv

drop if year<1975 
gen dum_pre1978SforwardCites_scaled=(pre1978SforwardCites_scaled==0)
foreach x in $lister {
gen dependent = ln(`x')
reg dependent i.SIC  i.year FundSize pre1978SforwardCites_scaled  dum_pre1978SforwardCites_scaled if  year>=firstInvestment  
test(FundSize)
predict `x'_p
replace `x'_p=exp(`x'_p)
replace `x'_p=0 if `x'_p==. | `x'==0
drop dependent
global Inv $Inv `x' `x'_p
}
replace FundSize=exp(FundSize)-1
}



* Generate cummulative Variables 
{
sort ID year
global Investment
foreach x in $Inv  {
gen temp = 0 
replace temp = `x'  if year==firstInvestment
replace temp  = `x' + temp[_n-1]*0.85  if ID[_n]==ID[_n-1] & year>firstInvestment
local name  =substr("`x'",7,.)
local name = "AmountCum"+"`name'"
ren temp `name'
global Investment $Investment `name'
}
}


* Data Selection &  Save the dataset
{
drop if year>lastInvestment+2
cap drop age
gen age = year-firstInvestment
drop if age<-5 
}
* Create Sample Split for Venture Capital Invesment
{
bysort ID: egen mComp = mean(complexity/patentCount)



unab k: *_ab
di "`k'"
global splitter_ab `k'
gen inv_firstTime = inventor_firstTime 
global splitter_inv   inv_festablished  inv_firstTime
global type_patent   g_pat g_pat_75 i_pat w_pat  
global splitter  $splitter_inv $splitter_ab    lf_claim_res_min  prox_bi prox  complexity        forwardCites_scaled generality_scaled originality_scaled  



**** Create Amount by Quintile of Innovativeness
foreach x in $splitter {
gen `x'_pP =`x'/(patentCount)
global splitter $splitter `x'_pP
}


foreach x in $splitter {
gen x_split = `x'
sum x_split, d
replace x_split = r(p99) if x_split>r(p99) & x_split!=.
replace x_split = r(p1) if x_split<r(p1) & x_split!=.
replace x_split = . if patentCount==0
bysort ID: egen cu= mean(x_split)
replace cu=. if firstInvestment>1999 | lastInvestment<1979
egen n_vals = nvals(cu)
if n_vals>2 {
xtile quintile=cu, n(2) 
}
else {
gen quintile = cu+1
}
forvalues i = 1/2 {
gen AC_`x'_`i' = (quintile==`i')*AmountCum
gen AC_`x'_`i'_p = (quintile==`i')*AmountCum_p
global Investment $Investment  AC_`x'_`i'  AC_`x'_`i'_p 
}
drop cu n_vals quintile x_split 
}




gen quality_quintile = g_pat_75_ab
foreach x in     festablished  {
forvalues i = 0/1 {
forvalues j = 0/1 {
gen AC_`x'_`i'_`j' = (inv_`x'==`i')*(quality_quintile==`j')*AmountCum
gen AC_`x'_`i'_`j'_p = (inv_`x'==`i')*(quality_quintile==`j')*AmountCum_p
global Investment $Investment  AC_`x'_`i'_`j'  AC_`x'_`i'_`j'_p 
}
}
}



gen higher = (pre1978SpatentCount>0)





forvalues i = 0/1 {
forvalues j = 0/1 {
foreach x in  firstTime   {
gen AC_pat_`x'_`j'_`i' = (higher==`j')*(inv_`x'==`i')*AmountCum
gen AC_pat_`x'_`j'_`i'_p = (higher==`j')*(inv_`x'==`i')*AmountCum_p
global Investment $Investment   AC_pat_`x'_`j'_`i'  AC_pat_`x'_`j'_`i'_p 
}
}
}



forvalues i = 0/1 {
forvalues j = 0/1 {
gen AC_gi_75_ab_`i'_`j' = (g_pat_75_ab==`i')*(i_pat_ab==`j')*AmountCum
gen AC_gi_75_ab_`i'_`j'_p = (g_pat_75_ab==`i')*(i_pat_ab==`j')*AmountCum_p
global Investment $Investment   AC_gi_75_ab_`i'_`j'  AC_gi_75_ab_`i'_`j'_p
}
}




drop age

gen age = year-firstInvestment
drop if age>10
drop if year>2006
*joinby ID  using D:\Dropbox\Spillovers\prog\choice_1, unmatched(both)

save proc\datasetPart1.dta, replace
}
}


*********************************************************************************************************
* V) CompuStat Data
*********************************************************************************************************
{

*Compustat Instruments
{
insheet using "data\uc.csv", clear
keep code rho_high year
save uc.dta, replace
use proc\mergeCompuPatentsRaw.dta, clear
keep gvkey year state 
gen number=1
sort gvkey year
gen one=1
drop if state==""
replace state="Rest" if state==""
collapse (sum) one, by(state gvkey) 
sort gvkey state  
bysort gvkey: egen sumOne= total(one)
replace one = one/sumOne
ren state code
joinby code using uc.dta
gen rdCosts = rho_high*one
collapse (sum) rdCosts, by(year gvkey)
save data\rdCosts, replace

}

* Compustat Basic data
{
insheet using "data\compustatNorthAmerica.csv", clear
drop if loc!="USA"

ren fyear year
merge m:1 year using data\deflator
keep if _merge==3
foreach x in  prcc_f   act ao aqi aqs at bkvlps capx capxv cogs csho dd dp dtea ib intan invfg invo invrm invt invwip ivaeq ivao lifr ls lt oiadp pi pidom pifo ppegt ppent pstk sale xad xint xintd xlr xpr xrd xrent xsga mkvalt {
replace `x' = `x'/pIndex
}
ren year fyear

bysort conm: egen cummer = total(xrd) // Drop companies without R&D
drop if cummer==0
drop cummer
drop if xrd==.
bysort conm: egen cummer = total(1) // Drop companies with less than 5 years in data
drop if cummer<5

keep sic conm gvkey fyear xrd sale emp city state ppent  n* ppent emp xrd  sale 

* Start cleaning procedure of Bloom, Schankerman and van Reenen (2013) Econometria
{
sort gvkey fyear sic sale
duplicates drop gvkey fyear, force
xtset gvkey fyear

ren fyear year
merge m:1 year using data\deflator
drop if _merge==2
drop _merge
foreach x in ppent sale xrd {
replace `x' = `x'/pIndex
}
sort gvkey year

duplicates drop gvkey year, force
gen venture = 0
gen num = gvkey
ren sale sales
by gvkey:gen dsales=(sales-sales[_n-1])/sales[_n-1]
by gvkey: gen demp=(emp-emp[_n-1])/emp[_n-1]
gen sales_emp=sales/emp
gen ppent_emp=ppent/emp
drop if (dsales<-0.66|dsales>2)&dsales~=.
drop if (demp<-0.66|demp>2)&demp~=.
drop if (ppent_emp<0.1|ppent_emp>1000)&ppent_emp~=.
drop if (sales_emp>2000|sales_emp<2)&(sales_emp~=.)

 
cap drop dy
qui by gvkey:gen dy=year-year[_n-1]
replace dy=1 if sales==.|sales[_n-1]==.
egen dym=total(dy>1),by(gvkey)
drop if dym>1
egen jumpyear=max(year*(dy>1&dy~=.)),by(gvkey)
replace sales=. if year<=jumpyear&jumpyear<1995
replace sales=. if year>=jumpyear&jumpyear>=1995
replace dy=1 if sales==.|sales[_n-1]==.
replace emp=. if sales==.
replace xrd=. if sales==.
so gvkey year
egen maxy=max(year*(sales~=.)),by(gvkey)
drop if year>maxy+1
drop maxy
drop num
egen num = group(gvkey)
cap prog drop jumpsclean
cap prog def jumpsclean

cap drop dyear 
cap drop zz
egen zz=group(`1')
ge old`1'=`1'
cap drop num
ge num=zz
so num year
qui by num:ge dyear=year-year[_n-1]
ge prob=1 if dyear>1&dyear~=.
replace prob=0 if prob==.
egen mprob=max(prob),by(num)
ge prob_yr=year if dyear>1&dyear~=.
replace prob_yr=0 if prob_yr==.
egen maxprob_yr=max(prob_yr),by(num)
egen nojumps=total(prob),by(num)
egen maxnojumps=max(nojumps)
local i = 1
while `i' <= maxnojumps  {
di `i'
replace num=num + 100000 if mprob==1&(year<maxprob_yr)
replace prob_yr = 0 if year==maxprob_yr
cap drop maxprob_yr
egen maxprob_yr=max(prob_yr),by(num)
local i = `i' + 1 
}

local j=num
egen temp=group(num)
replace num=temp
cap drop nojumps maxnojumps mprob prob_yr maxprob_yr temp zz
so num year
cap drop dyear
qui by num:ge dyear=year-year[_n-1]
noi di "***** This should always be unity ******"
tab dyear
end

jumpsclean num
cap drop noj
egen noj=count(num),by(num)
so num year
ren sales sale
}
* End Cleaning procedure of Bloom, Schankerman and van Reenen (2013) Econometrica 

save proc\rawDataCompu_work.dta, replace

* Merge instruments and patents
use proc\rawDataCompu_work.dta, replace
drop if year<1970 | year>2005

* Balance panel
xtset gvkey year
tsfill, full

ren conm FirmName
ren sic SIC

gsort +  gvkey + year
replace FirmName=FirmName[_n-1] if gvkey==gvkey[_n-1] & FirmName[_n-1]!=""

foreach x in SIC  {
replace `x' = `x'[_n-1] if gvkey==gvkey[_n-1] & `x'[_n-1]!=.
}

gsort +  gvkey - year
replace FirmName=FirmName[_n-1] if gvkey==gvkey[_n-1] & FirmName[_n-1]!=""

foreach x in SIC  {
replace `x' = `x'[_n-1] if gvkey==gvkey[_n-1] & `x'[_n-1]!=.
}


* Merge instrument
merge 1:1 gvkey year using data\rdCosts
keep if _merge==3
drop _merge

* Create Cummulative Variables
sort gvkey year
replace xrd=0 if xrd==.
gen xrdYear = (xrd>0)*year
replace xrdYear = . if xrdYear==0
bysort gvkey: egen minYear = min(xrdYear)

replace minYear = 1970 if minYear<1970 | minYear==.
gen cumRD=0
replace cumRD = xrd if year==minYear
replace cumRD = xrd+cumRD[_n-1]*0.85 if gvkey[_n]==gvkey[_n-1] & year>minYear

* First Stage regression
xtset gvkey year
replace sale = 0 if sale==.
gen lnXrd = ln(xrd+1)
gen lnSale = ln(sale+1)
replace SIC=0 if SIC==.
reg lnXrd i.year  i.SIC  rdCosts lnSale, cluster(gvkey)
predict xrd_p
replace xrd_p = exp(xrd_p)-1

replace xrd_p=0 if xrd_p==.
gen cumRD_p=0
replace cumRD_p = xrd_p if year==1978
replace cumRD_p = xrd_p+cumRD_p[_n-1]*0.85 if gvkey[_n]==gvkey[_n-1] & year>1978
replace cumRD_p = 0 if xrd==0

joinby year gvkey using proc\patentCompuMerge.dta,  unmatched(both)
drop if _merge==2
drop _merge

foreach x in xrd  above outstanding   originality generality forwardCites  patentCount originality_scaled generality_scaled  forwardCites_scaled    {
replace `x' = 0 if `x'==.
}

* Presample fixed effect
gen miYear = minYear
replace miYear = 1978 if miYear<1978
bysort gvkey: egen pSforwardCites_scaled= mean(forwardCites_scaled) if year<miYear & year>=miYear-5 
bysort gvkey: egen pSmaxforwardCites_scaled = max(pSforwardCites_scaled)
gen pre1978SforwardCites_scaled=pSmaxforwardCites_scaled
replace pre1978SforwardCites_scaled = 0 if pre1978SforwardCites_scaled==.
drop pSmaxforwardCites_scaled pSforwardCites_scaled



*Drop unreasonable high (or low) productivities
gen bangForBuck = (cumRD/(patentCount+1))
bysort FirmName: egen sdBang = sd(bangForBuck)
replace sdBang = 0 if sdBang==.
sum sdBang, detail
drop if sdBang>r(p99)
drop bangForBuck sdBang

* Drop companies without a patent in the considered time period
bysort gvkey: egen temp = total(patentCount)
drop if temp==0
drop temp
cap drop _merge
*joinby gvkey  using D:\Dropbox\Spillovers\prog\choice_2, unmatched(both)
save proc\datasetPart2.dta, replace
keep gvkey
duplicates drop gvkey, force
ren gvkey gvkey
save proc\IDEst, replace
}
}

***********************************************************************************************
* VI) Patent Descriptives for the different company types
***********************************************************************************************
{
* Patent descriptives file
{
use proc\patentsComplete_Full, clear
keep if venture==1 | established==1
merge m:1 ID using proc\firstLastInvestment

gen age = year-firstInvestment
ren year appyear
replace venture = 0 if (venture==1)  & appyear>lastInvestment
replace established = 0 if venture==1 & (appyear>=firstInvestment | appyear<=lastInvestment)
replace venture = 0 if established==1
 
replace gvkey=. if venture==1
replace ID = . if established==1

drop if ID==. & gvkey==. 

cap drop _merge
//drop if appyear<1990
cap drop patentCount
gen patentCount=1
drop if patentCount==.
drop if appyear==.
drop if appyear<1978
drop if  nclass==473
save proc\patentsDescriptive, replace
}

* Pure classification file
{
use proc\patentsComplete, clear
keep subcat  nclass
duplicates drop nclass, force


label define		subcatLabel		11		"Agriculture, Food, Textiles"	, modify
label define		subcatLabel		12		"Coating"	, modify
label define		subcatLabel		13		"Gas"	, modify
label define		subcatLabel		14		"Organic Compounds"	, modify
label define		subcatLabel		15		"Resins"	, modify
label define		subcatLabel		19		"Misc-chemical"	, modify
label define		subcatLabel		21		"Communications"	, modify
label define		subcatLabel		22		"Computer Hardware & Software"	, modify
label define		subcatLabel		23		"Computer Peripherals"	, modify
label define		subcatLabel		24		"Information Storage"	, modify
label define		subcatLabel		25		"Misc-Computer"	, modify
label define		subcatLabel		31		"Drugs"	, modify
label define		subcatLabel		32		"Surgery & Medical Instruments"	, modify
label define		subcatLabel		33		"Biotechnology"	, modify
label define		subcatLabel		39		"Misc-Drugs&Med"	, modify
label define		subcatLabel		41		"Electrical Devices"	, modify
label define		subcatLabel		42		"Electrical Lighting"	, modify
label define		subcatLabel		43		"Measuring & Testing"	, modify
label define		subcatLabel		44		"Nuclear & X-Rays"	, modify
label define		subcatLabel		45		"Power Systems"	, modify
label define		subcatLabel		46		"Semiconductor Devices"	, modify
label define		subcatLabel		49		"Miscellaneous-Elec"	, modify
label define		subcatLabel		51		"Material Processing & Handling"	, modify
label define		subcatLabel		52		"Metal Working"	, modify
label define		subcatLabel		53		"Motors Engines & Parts"	, modify
label define		subcatLabel		54		"Optics"	, modify
label define		subcatLabel		55		"Transportation"	, modify
label define		subcatLabel		59		"Misc-Mechanical"	, modify
label define		subcatLabel		61		"Agriculture, Husbandry, Food"	, modify
label define		subcatLabel		62		"Amusement Devices"	, modify
label define		subcatLabel		63		"Apparel & Textiles"	, modify
label define		subcatLabel		64		"Earth Working & Wells"	, modify
label define		subcatLabel		65		"Furniture, House Fixtures"	, modify
label define		subcatLabel		66		"Heating"	, modify
label define		subcatLabel		67		"Pipe & Joints"	, modify
label define		subcatLabel		68		"Receptacles"	, modify
label define		subcatLabel		69		"Misc-Others"	, modify
label values subcat subcatLabel
save proc\patentCategory, replace
}
* Cross citing matrix
{
use proc\patentsComplete_Full, clear
ren year appyear

keep patent  ID gvkey appyear nclass   subcat

merge m:1 ID using proc\IDVen
gen mVen = _merge
drop _merge

merge m:1 gvkey using proc\IDEst
gen mEst = _merge
drop _merge
keep if mVen==3 | mEst==3

keep patent  ID gvkey appyear nclass   subcat
save proc\patentMerge, replace


ren patent citing
foreach x in nclass   subcat ID gvkey appyear {
ren `x' `x'To
}



joinby citing using proc\citations.dta, unmatched(master)
ren cited patent 
cap drop _merge
joinby patent using proc\patentMerge, unmatched(both)

foreach x in nclass  ID gvkey subcat appyear {
ren `x' `x'Org
ren `x'To `x'
}



gen one =1
drop if nclass==.
drop if nclassOrg==.
save proc\patentCitesCompanyData, replace







* Necessary because the larger set has more nclasses
use proc\patentCitesCompanyData, clear
keep nclass appyear
duplicates drop nclass appyear, force
save proc\nselectCompany, replace


use proc\patentsComplete, clear
merge m:1 nclass appyear using proc\nselectCompany
keep if _merge==3
drop _merge

keep patent appyear nclass  subcat

ren patent citing
foreach x in nclass   subcat appyear {
ren `x' `x'To
}

joinby citing using proc\citations.dta, unmatched(master)
ren cited patent 
cap drop _merge
joinby patent using proc\patentClass, unmatched(both)
cap drop _merge
joinby patent using proc\patentAppyearGyear, unmatched(both)
cap drop _merge
foreach x in nclass   appyear {
ren `x' `x'Org
ren `x'To `x'
}

gen age = appyear - appyearOrg
drop if age<=0 | age>5


gen one =1
drop if nclass==.
drop if nclassOrg==.

collapse (sum) one , by(nclass nclassOrg appyear)
save proc\crossCitationPattern, replace


egen panelVar=group(nclass nclassOrg)
tsset panelVar appyear
drop if appyear==.
tsfill, full
gsort panelVar + appyear
replace nclass=nclass[_n-1] if panelVar == panelVar[_n-1] & nclass[_n-1]!=.
replace nclassOrg=nclassOrg[_n-1] if panelVar == panelVar[_n-1] & nclassOrg[_n-1]!=.

gsort panelVar - appyear
replace nclass=nclass[_n-1] if panelVar == panelVar[_n-1] & nclass[_n-1]!=.
replace nclassOrg=nclassOrg[_n-1] if panelVar == panelVar[_n-1] & nclassOrg[_n-1]!=.

replace one = 0 if one==.

sort nclassOrg nclass appyear
gen oneSum=0
drop panelVar
egen panelVar=group(nclass nclassOrg)
drop if appyear==.
tsset panelVar appyear
replace oneSum = one


foreach x in one  oneSum {
bysort nclassOrg appyear: egen summer = total(`x')
bysort nclass appyear: egen summer1 = total(`x')
replace summer=0 if summer==.
replace summer1=0 if summer1==.
gen `x'Absolut = `x'
gen `x'Basic = `x'/(summer1)
replace `x' = `x'/(summer)
drop summer summer1
}
sort panelVar appyear
replace appyear = appyear+1
drop panelVar
save nclassNclassLikelihood, replace
use nclassNclassLikelihood, replace
collapse (sum) one oneSum oneAbsolut oneBasic oneSumAbsolut oneSumBasic, by(nclass nclassOrg appyear)
save proc\nclassNclassLikelihood, replace
}

}

***************************************************************************************
* VII) Calculate Technological Distance between each company for every year
***************************************************************************************
{
* Vector of patent across tech classes
{
use proc\patentsDescriptive, clear

drop if ID==0 & gvkey==0
replace ID =0 if ID==.
replace gvkey=0 if gvkey==.
replace ID = 0 if venture==0
replace gvkey=0 if established==0
keep patentCount ID gvkey appyear nclass forwardCites backwardCites

egen groupID = group(ID gvkey)
gcollapse (sum) patentCount forwardCites backwardCites,by(gvkey nclass ID groupID appyear)
egen groupIDN = group(ID gvkey nclass)
duplicates drop groupIDN appyear, force
xtset groupIDN appyear
tsfill, full
replace patentCount = 0 if patentCount==.
foreach x in ID gvkey nclass groupID {
gsort groupIDN appyear
replace `x' = `x'[_n-1] if groupIDN == groupIDN[_n-1] & `x'[_n-1]!=.
gsort groupIDN - appyear
replace `x' = `x'[_n-1] if groupIDN == groupIDN[_n-1] & `x'[_n-1]!=.
}
replace forwardCites=0 if forwardCites==.
replace backwardCites=0 if backwardCites==.
gsort groupIDN appyear
foreach x in patentCount forwardCites backwardCites {
gen `x'Cum = 0
replace `x'Cum = `x' + `x'Cum[_n-1] if groupIDN==groupIDN[_n-1] & `x'Cum !=.
}
drop groupIDN 

foreach x in patentCount forwardCites backwardCites {
drop `x'
ren `x'Cum `x'
}

drop if ID==0 & gvkey==0
drop if nclass==.

merge m:1 ID using proc\IDVen
gen mVen = _merge
drop _merge

merge m:1 gvkey using proc\IDEst
gen mEst = _merge
drop _merge
keep if mVen==3 | mEst==3
cap drop _merge

merge m:1 nclass using proc\nclassSelectCompany
keep if _merge==3

keep ID nclass gvkey patentCount appyear groupID
compress
drop groupID
egen nclassID=group(nclass)
drop nclass
drop if nclassID==.
compress
levelsof(nclassID), local(nclassLevels)
reshape wide patentCount, i(gvkey ID appyear) j(nclassID) 
foreach l of local nclassLevels {
replace patentCount`l' = 0 if patentCount`l'==.
}
order gvkey ID appyear
saveold proc\merger, replace
*/
***Split
forvalues k = 1978/2006 {
use proc\merger, clear
keep if appyear==`k' 
drop appyear 
compress
saveold proc\mergerData_`k', replace  version(12)
}
}

* Weights for the Company-Company Distance
{
use proc\nclassNclassLikelihood, clear
keep appyear nclass nclassOrg oneSumAbsolut
merge m:1 nclass appyear using proc\nselectCompany
keep if _merge==3
drop _merge
drop if oneSumAbsolut==0
drop if nclass==473  | nclassOrg==473
save proc\companyCompanyDistance, replace
keep if nclass==nclassOrg

drop nclassOrg oneSumAbsolut appyear
duplicates drop nclass, force
expand 26
sort nclass
gen appyear=1978
replace appyear = appyear[_n-1]+1 if _n>1 & appyear[_n-1]<=2005
sort nclass appyear
duplicates drop nclass appyear, force
save proc\nclassSelection, replace

use proc\nclassSelection, clear
ren nclass nclassOrg 
drop appyear
duplicates drop nclassOrg, force
cross using proc\nclassSelection
sort nclass nclassOrg appyear
save proc\nclassSelection, replace
use proc\nclassSelection, clear
drop nclassOrg appyear
duplicates drop nclass, force

save proc\nclassSelectCompany, replace

use proc\companyCompanyDistance, clear
joinby nclass nclassOrg appyear using proc\nclassSelection, unmatched(both)
drop if _merge==1
drop _merge
replace oneSumAbsolut=0 if oneSumAbsolut==.
save proc\companyCompanyDistance, replace


* Weighing Matrix Cites
use  proc\companyCompanyDistance, clear
compress
egen nclassID  = group(nclassOrg)
drop nclassOrg
duplicates drop appyear nclass nclassID, force
levelsof(nclassID), local(nclassLevels)
reshape wide oneSumAbsolut, i(appyear nclass) j(nclassID)

foreach x of local nclassLevels {
replace oneSumAbsolut`x' = 0 if oneSumAbsolut`x' ==.
}
collapse (sum) oneSumAbsolut*, by(nclass)
forvalues k = 1978/2006 {
saveold proc\mergeCitesWeight_`k', replace  version(12)
}
}

* Identifier & Funding Data
{
use proc\mergerData_2005, clear
drop if ID==gvkey
keep ID gvkey
saveold proc\identifierMerger, replace

use proc\datasetPart2, clear
cap drop ID
save proc\datasetPart2Append, replace
use proc\datasetPart1, clear

keep ID  year  $Investment FundSize
append using proc\datasetPart2Append
foreach x in $Investment {
replace `x' = 0 if `x'==.
}
replace cumRD = 0 if cumRD==.
replace gvkey = 0 if gvkey==.
replace ID = 0 if ID==.
keep ID gvkey year  $Investment FundSize cumRD cumRD_p rdCosts
order ID gvkey year cumRD cumRD_p rdCosts $Investment FundSize
saveold proc\identifierFunding, replace  version(12)


forvalues k = 1978/2006 {
use proc\identifierMerger, clear
gen year=`k'
merge 1:1 ID gvkey year using proc\identifierFunding, update force
drop if _merge==2
drop _merge
foreach x in cumRD cumRD_p rdCosts $Investment FundSize {
replace `x'=0 if `x'==.
}
drop year
compress
sort gvkey ID 
order gvkey ID cumRD cumRD_p rdCosts  $Investment FundSize
saveold proc\funding_`k',replace version(12)
}

}








* Run R File
shell "C:\Program Files\R\R-4.0.0\bin\x64\R.exe" CMD BATCH  "distances_working.R" myreport.txt



* Merger Spillover Files together
{

use proc\CitesWeightOut_1979, clear 
gen year=1979
save proc\merger_1979, replace

forvalues k= 1980/2006 {
use proc\CitesWeightOut_`k', clear
gen year=`k' 
local l = `k'-1
di `l'
append using proc\merger_`l'

local j = `k'
save proc\merger_`j', replace

}
compress
drop if C_cumRD==0 & J_AmountCum==0
save proc\mergerSpillovers, replace

unab k:  J_* C_*
global spillovers `k'

use proc\datasetPart2, clear
cap drop ID
save proc\datasetPart2Append, replace

use proc\datasetPart1, clear
append using proc\datasetPart2Append
foreach x in ID gvkey  {
replace `x'=0 if `x'==.
}

joinby ID gvkey year using proc\method1, unmatched(master) update replace



global result
keep ID gvkey year cumRD cumRD_p  weight* ID1 gvkey1 year1  AmountCum AmountCum_p
foreach x in cumRD  cumRD_p    AmountCum AmountCum_p  {
replace `x'=0 if `x'==.
cap gen w12_`x' = weight12*`x'
cap replace w12_`x'= 0 if w12_`x'==.
global result $result w12_`x'
}

gen weight12Venture = weight12 *(ID!=0)
gen weight12Est = weight12*(ID==0)

keep $result weight*  ID1 gvkey1 year1
compress
gcollapse (sum)  $result weight*, by(ID1 gvkey1 year1)
ren ID1 ID 
ren gvkey1 gvkey
ren year1 year
drop if year<1979 | year>2006

save proc\method1Complementary,replace


use proc\DistanceC_1979, clear
gen year=1979
forvalues y = 1980/1999 {
append using proc\DistanceC_`y'
replace year = `y' if year==.
}
save proc\distance, replace
}
}



***************************************************************************************************
* VIII) Create Final dataset
***************************************************************************************************
{

use proc\datasetPart2, clear
cap drop ID
save proc\datasetPart2Append, replace

use proc\datasetPart1, clear
append using proc\datasetPart2Append
foreach x in ID gvkey  {
replace `x'=0 if `x'==.
}

drop venture
gen venture = (ID!=0)
gen established = (ID==0)
replace rdCosts=0 if rdCosts==.
replace lnSale = 0 if lnSale==.
drop if cumRD==0 & established==1
drop if xrd==. & established==1
drop if xrd==0 & established==1


merge 1:1 gvkey ID year using proc\mergerSpillovers.dta 
keep  if _merge==3
drop _merge

merge 1:1 ID gvkey year using proc\method1Complementary
drop if _merge==2
drop _merge

compress


* Standardize Variables

sort ID gvkey year
egen identifier = group(ID gvkey)

global outcomes  above outstanding patentCount originality generality  forwardCites  originality_scaled generality_scaled  forwardCites_scaled     
global relevantFeatures  xrd   cumRD  AmountCum Amount
global preFixedEffects  
global independent  xrd   sale 
foreach x in $relevantFeatures $true_char   $preFixedEffects $result $spillovers independent  {
cap replace `x'=0 if `x'==.
cap  gen `x'_base = `x'
cap replace `x' = ln(`x'+1)
}
foreach x in $outcomes $method1Outcome {
cap  replace `x'=0 if `x'==.
cap  gen `x'_base = `x'
cap  gen `x'_ln = ln(`x'+1)
cap  replace `x' = ln(`x'+1)

}


merge m:1 ID gvkey  using proc\Distance_RFILE
drop if _merge==2
replace weightCites = weightCites*100






cap drop mComp
bysort gvkey ID: egen mComp=mean(complexity/patentCount_base)

	
gen preSample = 0
gen dum_preSample=0
replace preSample = pre1978SforwardCites_scaled if pre1978SforwardCites_scaled!=.
replace dum_preSample = (pre1978SforwardCites_scaled==0)
label var preSample "Pre-sample FE"
replace forwardCites_scaled = forwardCites_scaled*100


drop if year<1978 | year>1999


* Drop small industires
cap drop ser 
bysort SIC venture: egen ser = nvals(identifier)
drop if ser<10
drop if (cumRD==0 | cumRD==.)  & (AmountCum==0 | AmountCum==.)
*keep ID year Amount SIC firstInvestment forwardCites_scaled pre1978SforwardCites_scaled FundSize FundSizeUSDMil FundSizeUSDMilRaw FundVenture AmountCum gvkey sale xrd rdCosts cumRD lnSale J_cumRD J_cumRD_p J_AmountCum J_AmountCum_p C_cumRD C_cumRD_p C_AmountCum_High C_AmountCum_High_p C_AmountCum_Low C_AmountCum_Low_p C_AmountCum C_AmountCum_p C_AmountCumCorr C_AmountCumCorr_p C_AC_inv_festablished_1 C_AC_inv_festablished_1_p C_AC_inv_festablished_2 C_AC_inv_festablished_2_p C_AC_inv_firstTime_1 C_AC_inv_firstTime_1_p C_AC_inv_firstTime_2 C_AC_inv_firstTime_2_p C_AC_g_pat_ab_1 C_AC_g_pat_ab_1_p C_AC_g_pat_ab_2 C_AC_g_pat_ab_2_p C_AC_g_pat_75_ab_1 C_AC_g_pat_75_ab_1_p C_AC_g_pat_75_ab_2 C_AC_g_pat_75_ab_2_p C_AC_i_pat_ab_1 C_AC_i_pat_ab_1_p C_AC_i_pat_ab_2 C_AC_i_pat_ab_2_p C_AC_w_pat_ab_1 C_AC_w_pat_ab_1_p C_AC_w_pat_ab_2 C_AC_w_pat_ab_2_p C_AC_lf_claim_res_min_1 C_AC_lf_claim_res_min_1_p C_AC_lf_claim_res_min_2 C_AC_lf_claim_res_min_2_p C_AC_prox_bi_1 C_AC_prox_bi_1_p C_AC_prox_bi_2 C_AC_prox_bi_2_p C_AC_prox_1 C_AC_prox_1_p C_AC_prox_2 C_AC_prox_2_p C_AC_complexity_1 C_AC_complexity_1_p C_AC_complexity_2 C_AC_complexity_2_p C_AC_forwardCites_scaled_pP_1 C_AC_forwardCites_scaled_pP_1_p C_AC_forwardCites_scaled_pP_2 C_AC_forwardCites_scaled_pP_2_p C_AC_generality_scaled_pP_1 C_AC_generality_scaled_pP_1_p C_AC_generality_scaled_pP_2 C_AC_generality_scaled_pP_2_p C_AC_originality_scaled_pP_1 C_AC_originality_scaled_pP_1_p C_AC_originality_scaled_pP_2 C_AC_originality_scaled_pP_2_p C_AC_festablished_0_0 C_AC_festablished_0_0_p C_AC_festablished_0_1 C_AC_festablished_0_1_p C_AC_festablished_1_0 C_AC_festablished_1_0_p C_AC_festablished_1_1 C_AC_festablished_1_1_p C_AC_pat_firstTime_0_0 C_AC_pat_firstTime_0_0_p C_AC_pat_firstTime_1_0 C_AC_pat_firstTime_1_0_p C_AC_pat_firstTime_0_1 C_AC_pat_firstTime_0_1_p C_AC_pat_firstTime_1_1 C_AC_pat_firstTime_1_1_p C_AC_gi_75_ab_0_0 C_AC_gi_75_ab_0_0_p C_AC_gi_75_ab_0_1 C_AC_gi_75_ab_0_1_p C_AC_gi_75_ab_1_0 C_AC_gi_75_ab_1_0_p C_AC_gi_75_ab_1_1 C_AC_gi_75_ab_1_1_p venture established w12_cumRD w12_cumRD_p w12_AmountCum w12_AmountCum_p weight12Venture weight12Est xrd_base cumRD_base AmountCum_base w12_cumRD_base w12_AmountCum_base J_cumRD_base J_AmountCum_base C_cumRD_base C_AmountCum_base C_AmountCumCorr_base above_base outstanding_base patentCount_base originality_base generality_base forwardCites_base originality_scaled_base generality_scaled_base forwardCites_scaled_base age identifier mComp preSample dum_preSample 


save proc\workFull, replace
use proc\workFull, replace

drop if AmountCum==0 & venture==1
drop if year<firstInvestment & venture==1


cap drop _merge
merge m:1 ID gvkey year using proc\distance
keep if _merge==3
drop _merge

* Keep only relevant variables
*keep ID year Amount SIC firstInvestment forwardCites_scaled pre1978SforwardCites_scaled FundSize FundSizeUSDMil FundSizeUSDMilRaw FundVenture AmountCum gvkey sale xrd rdCosts cumRD lnSale J_cumRD J_cumRD_p J_AmountCum J_AmountCum_p C_cumRD C_cumRD_p C_AmountCum_High C_AmountCum_High_p C_AmountCum_Low C_AmountCum_Low_p C_AmountCum C_AmountCum_p C_AmountCumCorr C_AmountCumCorr_p C_AC_inv_festablished_1 C_AC_inv_festablished_1_p C_AC_inv_festablished_2 C_AC_inv_festablished_2_p C_AC_inv_firstTime_1 C_AC_inv_firstTime_1_p C_AC_inv_firstTime_2 C_AC_inv_firstTime_2_p C_AC_g_pat_ab_1 C_AC_g_pat_ab_1_p C_AC_g_pat_ab_2 C_AC_g_pat_ab_2_p C_AC_g_pat_75_ab_1 C_AC_g_pat_75_ab_1_p C_AC_g_pat_75_ab_2 C_AC_g_pat_75_ab_2_p C_AC_i_pat_ab_1 C_AC_i_pat_ab_1_p C_AC_i_pat_ab_2 C_AC_i_pat_ab_2_p C_AC_w_pat_ab_1 C_AC_w_pat_ab_1_p C_AC_w_pat_ab_2 C_AC_w_pat_ab_2_p C_AC_lf_claim_res_min_1 C_AC_lf_claim_res_min_1_p C_AC_lf_claim_res_min_2 C_AC_lf_claim_res_min_2_p C_AC_prox_bi_1 C_AC_prox_bi_1_p C_AC_prox_bi_2 C_AC_prox_bi_2_p C_AC_prox_1 C_AC_prox_1_p C_AC_prox_2 C_AC_prox_2_p C_AC_complexity_1 C_AC_complexity_1_p C_AC_complexity_2 C_AC_complexity_2_p C_AC_forwardCites_scaled_pP_1 C_AC_forwardCites_scaled_pP_1_p C_AC_forwardCites_scaled_pP_2 C_AC_forwardCites_scaled_pP_2_p C_AC_generality_scaled_pP_1 C_AC_generality_scaled_pP_1_p C_AC_generality_scaled_pP_2 C_AC_generality_scaled_pP_2_p C_AC_originality_scaled_pP_1 C_AC_originality_scaled_pP_1_p C_AC_originality_scaled_pP_2 C_AC_originality_scaled_pP_2_p C_AC_festablished_0_0 C_AC_festablished_0_0_p C_AC_festablished_0_1 C_AC_festablished_0_1_p C_AC_festablished_1_0 C_AC_festablished_1_0_p C_AC_festablished_1_1 C_AC_festablished_1_1_p C_AC_pat_firstTime_0_0 C_AC_pat_firstTime_0_0_p C_AC_pat_firstTime_1_0 C_AC_pat_firstTime_1_0_p C_AC_pat_firstTime_0_1 C_AC_pat_firstTime_0_1_p C_AC_pat_firstTime_1_1 C_AC_pat_firstTime_1_1_p C_AC_gi_75_ab_0_0 C_AC_gi_75_ab_0_0_p C_AC_gi_75_ab_0_1 C_AC_gi_75_ab_0_1_p C_AC_gi_75_ab_1_0 C_AC_gi_75_ab_1_0_p C_AC_gi_75_ab_1_1 C_AC_gi_75_ab_1_1_p venture established w12_cumRD w12_cumRD_p w12_AmountCum w12_AmountCum_p weight12Venture weight12Est xrd_base cumRD_base AmountCum_base w12_cumRD_base w12_AmountCum_base J_cumRD_base J_AmountCum_base C_cumRD_base C_AmountCum_base C_AmountCumCorr_base above_base outstanding_base patentCount_base originality_base generality_base forwardCites_base originality_scaled_base generality_scaled_base forwardCites_scaled_base age identifier mComp preSample dum_preSample weightCites weightVentureJaffe weightEstJaffe weightVentureCites weightEstCites weightVentureMal weightEstMal
* Labels
label var cumRD "ln(R\&D Stock)"
label var AmountCum "ln(VC Stock)"

save proc\work, replace
}

***************************************************************************************************
* IX) Anonymize and label data
***************************************************************************************************
{
use proc\ventureDescriptive, clear
bysort ID: egen patent = max(_merge==3)
keep if fundingRoundNumber==1
collapse (sum) fundingRoundNumber patent, by(year)
gen share = patent/fundingRoundNumber
label var fundingRoundNumber "# of Start-ups"
label var share "% with Patent"
save proc\ventureDescriptive_aggregated, replace


use proc\workFull, replace
ren year appyear
drop if appyear<1979
collapse (sum)  patentCount_base   forwardCites_base  forwardCites_scaled_base, by(appyear venture)

foreach x in forwardCites_base forwardCites_scaled_base{
replace `x' = `x' / patentCount_base
}

label var forwardCites_base "Average citations"
label var forwardCites_scaled_base "Average scaled forward citations"

label var patentCount_base "# patents"
save proc\work_aggregated, replace

use proc\patentRaw, clear
gen patentCount = 1 
gcollapse (sum) patentCount  (mean)  forwardCites, by(appyear)
drop if appyear<1975
label var patentCount "# of Patents"
label var forwardCites "Average Cites per Patent"
save proc\patentRaw_aggregated, replace


use work, replace
merge m:1 ID gvkey  using Distance_RFILE
drop if _merge==2
replace weightCites = weightCites*100
cap drop _merge
merge m:1 ID gvkey year using distance
keep if _merge==3
drop _merge
keep ID year Amount SIC firstInvestment forwardCites_scaled pre1978SforwardCites_scaled FundSize FundSizeUSDMil FundSizeUSDMilRaw FundVenture AmountCum gvkey sale xrd rdCosts cumRD lnSale J_cumRD J_cumRD_p J_AmountCum J_AmountCum_p C_cumRD C_cumRD_p C_AmountCum_High C_AmountCum_High_p C_AmountCum_Low C_AmountCum_Low_p C_AmountCum C_AmountCum_p C_AmountCumCorr C_AmountCumCorr_p C_AC_inv_festablished_1 C_AC_inv_festablished_1_p C_AC_inv_festablished_2 C_AC_inv_festablished_2_p C_AC_inv_firstTime_1 C_AC_inv_firstTime_1_p C_AC_inv_firstTime_2 C_AC_inv_firstTime_2_p C_AC_g_pat_ab_1 C_AC_g_pat_ab_1_p C_AC_g_pat_ab_2 C_AC_g_pat_ab_2_p C_AC_g_pat_75_ab_1 C_AC_g_pat_75_ab_1_p C_AC_g_pat_75_ab_2 C_AC_g_pat_75_ab_2_p C_AC_i_pat_ab_1 C_AC_i_pat_ab_1_p C_AC_i_pat_ab_2 C_AC_i_pat_ab_2_p C_AC_w_pat_ab_1 C_AC_w_pat_ab_1_p C_AC_w_pat_ab_2 C_AC_w_pat_ab_2_p C_AC_lf_claim_res_min_1 C_AC_lf_claim_res_min_1_p C_AC_lf_claim_res_min_2 C_AC_lf_claim_res_min_2_p C_AC_prox_bi_1 C_AC_prox_bi_1_p C_AC_prox_bi_2 C_AC_prox_bi_2_p C_AC_prox_1 C_AC_prox_1_p C_AC_prox_2 C_AC_prox_2_p C_AC_complexity_1 C_AC_complexity_1_p C_AC_complexity_2 C_AC_complexity_2_p C_AC_forwardCites_scaled_pP_1 C_AC_forwardCites_scaled_pP_1_p C_AC_forwardCites_scaled_pP_2 C_AC_forwardCites_scaled_pP_2_p C_AC_generality_scaled_pP_1 C_AC_generality_scaled_pP_1_p C_AC_generality_scaled_pP_2 C_AC_generality_scaled_pP_2_p C_AC_originality_scaled_pP_1 C_AC_originality_scaled_pP_1_p C_AC_originality_scaled_pP_2 C_AC_originality_scaled_pP_2_p C_AC_festablished_0_0 C_AC_festablished_0_0_p C_AC_festablished_0_1 C_AC_festablished_0_1_p C_AC_festablished_1_0 C_AC_festablished_1_0_p C_AC_festablished_1_1 C_AC_festablished_1_1_p C_AC_pat_firstTime_0_0 C_AC_pat_firstTime_0_0_p C_AC_pat_firstTime_1_0 C_AC_pat_firstTime_1_0_p C_AC_pat_firstTime_0_1 C_AC_pat_firstTime_0_1_p C_AC_pat_firstTime_1_1 C_AC_pat_firstTime_1_1_p C_AC_gi_75_ab_0_0 C_AC_gi_75_ab_0_0_p C_AC_gi_75_ab_0_1 C_AC_gi_75_ab_0_1_p C_AC_gi_75_ab_1_0 C_AC_gi_75_ab_1_0_p C_AC_gi_75_ab_1_1 C_AC_gi_75_ab_1_1_p venture established w12_cumRD w12_cumRD_p w12_AmountCum w12_AmountCum_p weight12Venture weight12Est xrd_base cumRD_base AmountCum_base w12_cumRD_base w12_AmountCum_base J_cumRD_base J_AmountCum_base C_cumRD_base C_AmountCum_base C_AmountCumCorr_base above_base outstanding_base patentCount_base originality_base generality_base forwardCites_base originality_scaled_base generality_scaled_base forwardCites_scaled_base age identifier mComp preSample dum_preSample weightCites weightVentureJaffe weightEstJaffe weightVentureCites weightEstCites weightVentureMal weightEstMal

foreach x in gvkey ID  {
egen g_ID = group(`x')
replace `x' = g_ID
drop g_ID
}
save work, replace


use work, clear
label var forwardCites_base "Citation-weighted patents p.a."
label var forwardCites_scaled_base "Scaled citation-weighted patents p.a."
label var year "Calendar year (pseudo)"
label var patentCount_base "Patent count (per annum)"
label var year "Number of Years in Data"
label var xrd_base "Average R\&D (million dollars)"
label var sale "Average sales (billion dollars)"
label var mComp "Average complexity of companies' patents"
label var weightCites "Average citation-augmented proximity (x 100)"
label var patentCount_base "Patent count (per annum)"
label var Amount "Total VC investment (million dollars)"
label var mComp "Average complexity of companies' patents"
label var weightCites "Average citation-augmented proximity (x 100)"
label var patentCount_base "\# patents"
label var above_base "I(Best 50\% of citations)"
label var outstanding_base    "I(Best 5\% of citations)"
label var forwardCites_base  "Forward citations"
label var forwardCites_scaled_base "Scaled citations"
label var generality_base  "Generality"
label var generality_scaled_base  "Scaled generality"
label var originality_base  "Originality"
label var originality_scaled_base  "Scaled originality"
label var SIC "SIC Code (pseudo)"
label var gvkey "Compustat identifier (pseudo)"
label var ID "VentureXpert identifier (pseudo)"
label var firstInvestment "Year of first investment (pseudo)"

label var forwardCites_scaled "log Scaled citation-weighted patents p.a."
label var pre1978SforwardCites_scaled "Pre-sample fixed effect"
label var rdCosts "R&D costs per firm"
label var lnSale "log sales"
label var cumRD "ln(R&D Stock)"
label var AmountCum "ln(VC Stock)"
label var xrd "R&D spending"
label var xrd_base "R&D spending"
label var cumRD_base "R&D Stock"
label var AmountCum_base "VC Stock"

label var w12_cumRD_base 
label var w12_AmountCum_base 
label var J_cumRD_base "Spillover Est.  - Jaffe distance"
label var J_AmountCum_base  "Spillover VC  - Jaffe distance"
label var C_cumRD_base  "Spillover Est.  - Citation-augmented distance"
label var C_AmountCum_base "Spillover VC  - Citation-augmented distance"
label var C_AmountCumCorr_base "Spillover VC adjusted  - Citation-augmented distance"

label var J_cumRD "Ln(Spillover Est.)  - Jaffe distance"
label var J_cumRD_p "Ln(Spillover Est.) predicted  - Jaffe distance"

label var C_cumRD_p "Ln(Spillover Est.) predicted  - Citation-augmented distance"
label var C_cumRD "Ln(Spillover Est.)  -  Citation-augmented distance"

label var w12_cumRD_p "Ln(Spillover Est.) predicted  - Direct citation distance"
label var w12_cumRD "Ln(Spillover Est.)  - Direct citation distance"

label var J_AmountCum "Ln(Spillover VC)  - Jaffe distance"
label var J_AmountCum_p "Ln(Spillover VC) predicted  - Jaffe distance"

label var C_AmountCum_p "Ln(Spillover VC) predicted  - Citation-augmented distance"
label var C_AmountCum "Ln(Spillover VC)  -  Citation-augmented distance"

label var w12_AmountCum_p "Ln(Spillover VC) predicted  - Direct citation distance"
label var w12_AmountCum "Ln(Spillover VC)  - Direct citation distance"
label var established "Indicator whether established company"
label var C_AmountCum_High "Ln(Spillover Est.)  -  Pre patent"
label var  C_AmountCum_High_p "Ln(Spillover Est.)  -  Pre patent - predicted"
label var  C_AmountCum_Low "Ln(Spillover VC)  -  No pre patent"
label var C_AmountCum_Low_p "Ln(Spillover VC)  -  Pre patent - predicted"

label var C_AmountCumCorr  "Ln(Spillover VC) predicted  - Citation-augmented distance - corrected"
label var C_AmountCumCorr_p  "Ln(Spillover VC) predicted  - Citation-augmented distance - corrected - predicted"
label var C_AC_inv_festablished_1_p "Ln(Spillover VC) - without future patent - predicted"
label var C_AC_inv_festablished_2 "Ln(Spillover VC) - with future patent"
label var C_AC_inv_festablished_1 "Ln(Spillover VC) - without future patent"
label var C_AC_inv_festablished_2_p "Ln(Spillover VC) - with future patent - predicted"
label var C_AC_inv_firstTime_1 "Ln(Spillover VC) - first patent"

label var C_AC_inv_firstTime_1_p "Ln(Spillover VC) - first patent - predicted"
label var C_AC_inv_firstTime_2 "Ln(Spillover VC) - not first patent"
label var C_AC_inv_firstTime_2_p "Ln(Spillover VC) - not first patent - predicted"
label var C_AC_g_pat_ab_1 "Ln(Spillover VC) - below median novelty"
label var C_AC_g_pat_ab_1_p "Ln(Spillover VC) - below median novelty - predicted"
label var C_AC_g_pat_ab_2 "Ln(Spillover VC) - above median novelty"
label var C_AC_g_pat_ab_2_p "Ln(Spillover VC) - above median novelty - predicted"
label var C_AC_g_pat_75_ab_1 "Ln(Spillover VC) - below 75pct novelty"
label var C_AC_g_pat_75_ab_1_p "Ln(Spillover VC) - below 75pct novelty - predicted"
label var C_AC_g_pat_75_ab_2 "Ln(Spillover VC) - above 75pct novelty"
label var C_AC_g_pat_75_ab_2_p "Ln(Spillover VC) - above 75pct novelty - predicted"
label var C_AC_i_pat_ab_1 "Ln(Spillover VC) - below median no figures"
label var C_AC_i_pat_ab_1_p "Ln(Spillover VC) - below median no figures - predicted"
label var C_AC_i_pat_ab_2 "Ln(Spillover VC) - above median no figures"
label var C_AC_i_pat_ab_2_p "Ln(Spillover VC) - above median no figures - predicted"
label var C_AC_w_pat_ab_1 "Ln(Spillover VC) - below median length first claim"
label var C_AC_w_pat_ab_1_p "Ln(Spillover VC) - below median length first claim  - predicted"
label var C_AC_w_pat_ab_2 "Ln(Spillover VC) - above median length first claim"
label var C_AC_w_pat_ab_2_p "Ln(Spillover VC) - above median length first claim - predicted"
label var C_AC_lf_claim_res_min_1 "Ln(Spillover VC) - below median of minimum of length of first claims among all patents of start-up"
label var C_AC_lf_claim_res_min_1_p "Ln(Spillover VC) - below median of minimum of length of first claims among all patents of start-up - predicted"
label var C_AC_lf_claim_res_min_2 "Ln(Spillover VC) - above median of minimum of length of first claims among all patents of start-up"
label var C_AC_lf_claim_res_min_2_p "Ln(Spillover VC) - above median of minimum of length of first claims among all patents of start-up - predicted"
label var C_AC_prox_bi_1 "Ln(Spillover VC) - sparse co-author network"
label var C_AC_prox_bi_1_p  "Ln(Spillover VC) - sparse co-author network - predicted"
label var C_AC_prox_bi_2 "Ln(Spillover VC) - dense co-author network"
label var C_AC_prox_bi_2 "Ln(Spillover VC) - dense co-author network - predicted"

label var C_AC_prox_1 "Ln(Spillover VC) - low labor mobility"
label var C_AC_prox_1_p "Ln(Spillover VC) - low labor mobility - predicted" 
label var C_AC_prox_2 "Ln(Spillover VC) - high labor mobility"
label var C_AC_prox_2_p "Ln(Spillover VC) - high labor mobility - predicted"
label var C_AC_complexity_1 "Ln(Spillover VC) - discrete technology industries"
label var C_AC_complexity_1_p "Ln(Spillover VC) - discrete technology industries - predicted"
label var C_AC_complexity_2 "Ln(Spillover VC) - complex technology industries"
label var C_AC_complexity_2_p "Ln(Spillover VC) - complex technology industries - predicted"
label var C_AC_forwardCites_scaled_pP_1 "Ln(Spillover VC) - below median scaled forward citations per patent"
label var C_AC_forwardCites_scaled_pP_1_p "Ln(Spillover VC) - below median scaled forward citations per patent - predicted"
label var C_AC_forwardCites_scaled_pP_2 "Ln(Spillover VC) - above median scaled forward citations per patent"
label var C_AC_forwardCites_scaled_pP_2_p "Ln(Spillover VC) - above median scaled forward citations per patent - predicted"
label var C_AC_generality_scaled_pP_1 "Ln(Spillover VC) - below median scaled generality per patent"
label var C_AC_generality_scaled_pP_1_p "Ln(Spillover VC) - below median scaled generality per patent - predicted"
label var C_AC_generality_scaled_pP_2 "Ln(Spillover VC) - above median scaled generality per patent"
label var C_AC_generality_scaled_pP_2_p "Ln(Spillover VC) - above median scaled generality per patent - predicted"
label var C_AC_originality_scaled_pP_1_p "Ln(Spillover VC) - below median scaled originality per patent - predicted"
label var C_AC_originality_scaled_pP_2_p "Ln(Spillover VC) - above median scaled originality per patent - predicted"
label var C_AC_originality_scaled_pP_1 "Ln(Spillover VC) - below median scaled originality per patent"
label var C_AC_originality_scaled_pP_2 "Ln(Spillover VC) - above median scaled originality per patent"

label var C_AC_festablished_0_0 "Ln(Spillover VC) - without future patent & not novel"
label var C_AC_festablished_0_0_p "Ln(Spillover VC) - without future patent & not novel - predicted"
label var C_AC_festablished_0_1 "Ln(Spillover VC) - without future patent & novel"
label var C_AC_festablished_0_1_p "Ln(Spillover VC) - without future patent &  novel - predicted"
label var C_AC_festablished_1_0 "Ln(Spillover VC) - with future patent & not novel"
label var C_AC_festablished_1_0_p "Ln(Spillover VC) - with future patent & not novel - predicted"
label var C_AC_festablished_1_1 "Ln(Spillover VC) - with future patent & novel"
label var C_AC_festablished_1_1_p "Ln(Spillover VC) - with future patent & novel - predicted"
label var C_AC_pat_firstTime_0_0 "Ln(Spillover VC) - not first patent & not novel"
label var C_AC_pat_firstTime_0_0_p "Ln(Spillover VC) - not first patent & not novel - predicted"
label var C_AC_pat_firstTime_1_0 "Ln(Spillover VC) - first patent & not novel"
label var C_AC_pat_firstTime_1_0_p "Ln(Spillover VC) - first patent & not novel - predicted"
label var C_AC_pat_firstTime_0_1 "Ln(Spillover VC) - not first patent & novel"
label var C_AC_pat_firstTime_0_1_p "Ln(Spillover VC) - not first patent & novel - predicted"
label var C_AC_pat_firstTime_1_1 "Ln(Spillover VC) - first patent & novel"
label var C_AC_pat_firstTime_1_1_p "Ln(Spillover VC) - first patent & novel - predicted"
label var C_AC_gi_75_ab_0_0 "Ln(Spillover VC) - below 75 pct novelty & below median no figures"
label var C_AC_gi_75_ab_0_0_p "Ln(Spillover VC) - below 75 pct novelty & below median no figures - predicted"
label var C_AC_gi_75_ab_0_1 "Ln(Spillover VC) - below 75 pct novelty & above median no figures"
label var C_AC_gi_75_ab_0_1_p "Ln(Spillover VC) - below 75 pct novelty & above median no figures  - predicted"
label var C_AC_gi_75_ab_1_0 "Ln(Spillover VC) - above 75 pct novelty & below median no figures"
label var C_AC_gi_75_ab_1_0_p "Ln(Spillover VC) - above 75 pct novelty & below median no figures - predicted"
label var C_AC_gi_75_ab_1_1 "Ln(Spillover VC) - above 75 pct novelty & above median no figures"
label var C_AC_gi_75_ab_1_1_p "Ln(Spillover VC) - above 75 pct novelty & above median no figures - predicted"


label var venture "Indicator VC financed start-up"

foreach x in  SIC {
egen g_ID = group(`x')
replace `x' = g_ID
drop g_ID
}

foreach x in  Amount FundSize FundSizeUSDMil FundSizeUSDMilRaw FundVenture sale xrd lnSale {
gen r = uniform()*0.1
replace `x' = `x'+r if `x'>0 & `x'!=.
drop r 
}

save work, replace



}


